#!/usr/bin/env python
#
# Copyright (C) 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module to pull tasks from TaskQueues and execute them.
This module does the following in an infinite loop.
1. Connects to Task API (of TaskQueues API collection) to request lease on
certain number of tasks (specified by user).
2. Spawns parallel processes to execute the leased tasks.
3. Polls all the tasks continously till they finish.
4. Deletes the tasks from taskqueue on their successful completion.
5. It lets the user specify when to invoke the lease request instead of polling
tasks status in a tight loop for better resource utilization:
a. Invoke the Lease request when runnning tasks go beyound certain
threshold (min_running_tasks)
b. Wait time becomes more than specified poll-time-out interval.
6. Repeat the steps from 1 to 5 when either all tasks have finished executing
or one of the conditions in 5) is met. """
import sys
import time

from apiclient.errors import HttpError
from gtaskqueue.client_task import ClientTask
from gtaskqueue.taskqueue_client import TaskQueueClient
from gtaskqueue.taskqueue_logger import logger
from gtaskqueue.taskqueue_logger import set_logger

from google.apputils import app
import gflags as flags
FLAGS = flags.FLAGS
flags.DEFINE_string(
    'project_name',
    'default',
    'The name of the TaskQueue API project.')
flags.DEFINE_string(
    'taskqueue_name',
    'testpuller',
    'The taskqueue to which the client connects.')
flags.DEFINE_integer(
    'lease_secs',
    30,
    'The lease duration for a task, in seconds.')
flags.DEFINE_integer(
    'num_tasks',
    10,
    'The number of tasks to lease.')
flags.DEFINE_integer(
    'min_running_tasks',
    0,
    'Minimum number of running tasks below which a new lease is invoked.')
flags.DEFINE_float(
    'sleep_interval_secs',
    2,
    'Sleep interval when no tasks are found in the taskqueue (in seconds).')
flags.DEFINE_float(
    'timeout_secs_for_next_lease_request',
    600,
    'Maximum time to keep polling running tasks before invoking the next '
    'lease request (in seconds).')
flags.DEFINE_integer(
    'taskapi_requests_per_sec',
    None,
    'Limit on TaskQueue API requests per second.')
flags.DEFINE_float(
    'sleep_before_next_poll_secs',
    2,
    'Sleep interval before the next poll (in seconds).')
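# Example invocation (illustrative; the project and queue names below are
# placeholders, and the flag defaults defined above apply when a flag is
# omitted):
#
#   ./gtaskqueue_puller \
#       --project_name=my-project \
#       --taskqueue_name=testpuller \
#       --lease_secs=30 \
#       --num_tasks=10 \
#       --taskapi_requests_per_sec=5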
class TaskQueuePuller(object):
    """Maintains state information for the TaskQueuePuller."""

    def __init__(self):
        self._last_lease_time = None
        self._poll_timeout_start = None
        self._num_last_leased_tasks = 0
        # Dictionary mapping running tasks' ids to their corresponding
        # ClientTask objects.
        self._taskprocess_map = {}
        try:
            self.__tcq = TaskQueueClient()
            self.task_api = self.__tcq.get_taskapi()
        except HttpError as http_error:
            logger.error('Could not get TaskQueue API handler and hence '
                         'exiting: %s' % str(http_error))
            sys.exit()

    def _can_lease(self):
        """Determines if new tasks can be leased.

        Determines if new tasks can be leased based on
        1. The number of tasks already running in the system.
        2. The limit on accessing the TaskQueue API.

        Returns:
            True/False.
        """
        return self._num_tasks_to_lease() > 0 and not self._is_rate_exceeded()

    def _is_rate_exceeded(self):
        """Determines if requests/sec to the TaskQueue API exceed the limit.

        We do not access the API beyond the specified permissible limit. If
        we have run N tasks in the time elapsed since the last lease, we have
        already made N+1 requests to the API (1 for the collective lease and
        N for their individual delete operations). If K requests/sec is the
        limit on accessing the API, then we should not invoke any request to
        the API before roughly (N+1)/K seconds have passed. That condition is
        formulated in the following method.

        Returns:
            True/False.
        """
        if not FLAGS.taskapi_requests_per_sec:
            return False
        if not self._last_lease_time:
            return False
        curr_time = time.time()
        return ((curr_time - self._last_lease_time) <
                (1.0 * (self._num_last_leased_tasks -
                        len(self._taskprocess_map)) /
                 FLAGS.taskapi_requests_per_sec))
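    # Worked example for the check above (illustrative numbers): with
    # --taskapi_requests_per_sec=5, suppose the last lease returned 10 tasks
    # and 4 of them have since completed and been deleted, leaving 6 entries
    # in _taskprocess_map. Roughly 4 delete requests have been made since the
    # lease, so the next API request is held back until at least
    # (10 - 6) / 5 = 0.8 seconds have elapsed since the last lease.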
    def _num_tasks_to_lease(self):
        """Determines how many tasks can be leased.

        num_tasks is the upper limit on tasks running in the system, and
        hence the number of tasks which can be leased is the difference
        between num_tasks and the number of currently running tasks.

        Returns:
            Number of tasks to lease.
        """
        return FLAGS.num_tasks - len(self._taskprocess_map)

    def _update_last_lease_info(self, result):
        """Updates the information regarding the last lease.

        Args:
            result: Response object from the TaskQueue API, containing a list
                of tasks.
        """
        self._last_lease_time = time.time()
        if result and result.get('items'):
            self._num_last_leased_tasks = len(result.get('items'))
        else:
            self._num_last_leased_tasks = 0

    def _update_poll_timeout_start(self):
        """Updates the start time for the poll timeout."""
        if not self._poll_timeout_start:
            self._poll_timeout_start = time.time()

    def _continue_polling(self):
        """Checks whether polling should continue or a lease can be invoked.

        A lease can be invoked if
        1. The number of running tasks in the system has gone below the
           specified threshold (min_running_tasks).
        2. The wait time has exceeded the specified timeout and at least one
           task has finished since the last lease invocation.

        By doing this, we are essentially trying to batch the lease requests.
        If this were not done and we started off leasing N tasks, tasks would
        likely finish slightly one after another, and we would make one lease
        request per finished task, and so on. This can result in unnecessary
        lease API calls, so to avoid that we try to batch the lease requests.
        We also put a limit on the wait time for batching the requests by
        incorporating the timeout.

        Returns:
            True/False.
        """
        if len(self._taskprocess_map) <= FLAGS.min_running_tasks:
            return False
        if self._poll_timeout_start:
            elapsed_time = time.time() - self._poll_timeout_start
            if elapsed_time > FLAGS.timeout_secs_for_next_lease_request:
                self._poll_timeout_start = None
                return False
        return True
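    # Illustrative scenario for the batching above (numbers are made up):
    # with --num_tasks=10 and --min_running_tasks=3, polling continues while
    # more than 3 tasks are still running and only stops (triggering a new
    # lease) once 3 or fewer remain, or once the poll timeout above expires,
    # so completed slots are refilled in batches rather than with one lease
    # request per finished task.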
    def _get_tasks_from_queue(self):
        """Gets the available tasks from the taskqueue.

        Returns:
            Lease response object.
        """
        try:
            tasks_to_fetch = self._num_tasks_to_lease()
            lease_req = self.task_api.tasks().lease(
                project=FLAGS.project_name,
                taskqueue=FLAGS.taskqueue_name,
                leaseSecs=FLAGS.lease_secs,
                numTasks=tasks_to_fetch,
                body={})
            result = lease_req.execute()
            return result
        except HttpError as http_error:
            logger.error('Error during lease request: %s' % str(http_error))
            return None

    def _create_subprocesses_for_tasks(self, result):
        """Spawns parallel subprocesses to execute tasks for better
        throughput.

        Args:
            result: Lease response dictionary object.
        """
        if not result:
            logger.info('No result from the lease request')
            return None
        if result.get('items'):
            for task in result.get('items'):
                task_id = task.get('id')
                # Given that a task may be leased multiple times, we may get
                # a task which we are currently executing, so make sure we
                # don't spawn another subprocess for it.
                if task_id not in self._taskprocess_map:
                    ct = ClientTask(task)
                    # Check if the task got initialized properly and then put
                    # it in the running tasks map.
                    if ct.init():
                        # Put the ClientTask objects in a dictionary to keep
                        # track of stats; the objects are used later to
                        # delete the tasks from the taskqueue.
                        self._taskprocess_map[ct.get_task_id()] = ct

    def _poll_running_tasks(self):
        """Polls all the running tasks and deletes them from the taskqueue if
        completed."""
        if self._taskprocess_map:
            # Iterate over a copy of the values so entries can be deleted
            # from the map while iterating.
            for task in list(self._taskprocess_map.values()):
                if task.is_completed(self.task_api):
                    del self._taskprocess_map[task.get_task_id()]
                    # Updates scheduling information for later use.
                    self._update_poll_timeout_start()

    def _sleep_before_next_lease(self):
        """Sleeps before invoking a lease if needed, based on last lease info.

        It sleeps when no tasks were found on the taskqueue during the last
        lease request. Note that it discounts the time taken in polling the
        tasks and sleeps for (sleep_interval - time taken in the poll). This
        avoids an unnecessary wait when tasks could be leased. If no time was
        spent polling because there were no tasks in the system, it waits for
        the full sleep interval and thus optimizes CPU cycles.

        It does not sleep if the method is called for the first time (when no
        lease request has ever been made).
        """
        sleep_secs = 0
        if self._last_lease_time and self._num_last_leased_tasks <= 0:
            time_since_last_lease = time.time() - self._last_lease_time
            sleep_secs = FLAGS.sleep_interval_secs - time_since_last_lease
        if sleep_secs > 0:
            logger.info('No tasks found and hence sleeping for some time')
            time.sleep(sleep_secs)
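    # For example (illustrative): with --sleep_interval_secs=2, if the last
    # lease returned no tasks and roughly 0.5 seconds have since been spent
    # polling, the puller sleeps for the remaining ~1.5 seconds before
    # issuing the next lease request.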
    def lease_tasks(self):
        """Requests a lease for the specified number of tasks.

        It invokes a lease request for the appropriate number of tasks,
        spawns parallel processes to execute them and also maintains
        scheduling information.

        lease_tasks also takes care of waiting (sleeping) before invoking a
        lease if there are no tasks in the taskqueue which can be leased.
        This results in better resource utilization. Apart from this, it also
        controls the number of requests being sent to the TaskQueue API.

        Returns:
            True/False based on whether tasks could be leased or not.
        """
        self._sleep_before_next_lease()
        if self._can_lease():
            result = self._get_tasks_from_queue()
            self._update_last_lease_info(result)
            self._create_subprocesses_for_tasks(result)
            return True
        return False

    def poll_tasks(self):
        """Polls the status of the running tasks of the system.

        Polls the status of tasks and then decides whether it should continue
        to poll, depending on the number of tasks running in the system and
        timeouts. Instead of polling in a tight loop, it sleeps for some time
        before the next poll to avoid unnecessary CPU cycles. poll_tasks
        returns only when the system has the capacity to accommodate at least
        one new task.
        """
        self._poll_running_tasks()
        while self._continue_polling():
            logger.info('Sleeping before next poll')
            time.sleep(FLAGS.sleep_before_next_poll_secs)
            self._poll_running_tasks()
def main(argv):
    """Infinite loop to lease new tasks and poll them for completion."""
    # Settings for logger.
    set_logger()
    # Instantiate puller.
    puller = TaskQueuePuller()
    while True:
        puller.lease_tasks()
        puller.poll_tasks()


if __name__ == '__main__':
    app.run()