Big threadpool update.
This commit is contained in:
parent
12175c23a7
commit
4e9b43be8b
1 changed files with 195 additions and 110 deletions
|
@ -1,11 +1,30 @@
|
||||||
|
'''
|
||||||
|
The documentation for the classes and methods is below. Here are some examples
|
||||||
|
of threadpool in use:
|
||||||
|
|
||||||
|
1. Powering a single api scraping generator with many threads:
|
||||||
|
|
||||||
|
pool = threadpool.ThreadPool(thread_count, paused=True)
|
||||||
|
job_gen = ({'function': api.get_item, 'kwargs': {'id': i}} for i in range(lower, upper+1))
|
||||||
|
pool.add_generator(job_gen)
|
||||||
|
for job in pool.result_generator():
|
||||||
|
if job.exception:
|
||||||
|
raise job.exception
|
||||||
|
if job.value is not None:
|
||||||
|
yield job.value
|
||||||
|
|
||||||
|
'''
|
||||||
|
import collections
|
||||||
|
import queue
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
|
from voussoirkit import lazychain
|
||||||
from voussoirkit import sentinel
|
from voussoirkit import sentinel
|
||||||
|
|
||||||
PENDING = 'pending'
|
PENDING = sentinel.Sentinel('PENDING')
|
||||||
RUNNING = 'running'
|
RUNNING = sentinel.Sentinel('RUNNING')
|
||||||
FINISHED = 'finished'
|
FINISHED = sentinel.Sentinel('FINISHED')
|
||||||
RAISED = 'raised'
|
RAISED = sentinel.Sentinel('RAISED')
|
||||||
|
|
||||||
NO_RETURN = sentinel.Sentinel('NO_RETURN', truthyness=False)
|
NO_RETURN = sentinel.Sentinel('NO_RETURN', truthyness=False)
|
||||||
NO_EXCEPTION = sentinel.Sentinel('NO_EXCEPTION', truthyness=False)
|
NO_EXCEPTION = sentinel.Sentinel('NO_EXCEPTION', truthyness=False)
|
||||||
|
@ -17,134 +36,136 @@ class PoolClosed(ThreadPoolException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class ThreadPool:
|
class ThreadPool:
|
||||||
def __init__(self, size, paused=False):
|
'''
|
||||||
|
The ThreadPool is used to perform large numbers of tasks using a pool of
|
||||||
|
worker threads. Jobs are run in the order they are added.
|
||||||
|
|
||||||
|
The pool supports two main paradigms of usage:
|
||||||
|
|
||||||
|
1. Callback / async style
|
||||||
|
If the job function performs your desired side effects by itself, or is
|
||||||
|
given a callback function, you can simply add it to the pool and wait
|
||||||
|
for it to run.
|
||||||
|
|
||||||
|
2. Generator style
|
||||||
|
If you want to yield the job results back to the main thread for
|
||||||
|
processing (e.g. you are feeding the results into sqlite, which must be
|
||||||
|
done on the thread which opened the sqlite connection), you can use
|
||||||
|
`result_generator` to get each job in the order they were added to the
|
||||||
|
pool. This style also makes it easier to terminate the main thread when
|
||||||
|
a single job encounters an issue. Just `raise job.exception`.
|
||||||
|
'''
|
||||||
|
def __init__(self, size, paused=True):
|
||||||
'''
|
'''
|
||||||
|
size:
|
||||||
|
The number of worker threads.
|
||||||
|
|
||||||
paused:
|
paused:
|
||||||
The pool will start in a paused state and you will have to call
|
If True, the pool will start in a paused state and you will have to
|
||||||
`start` to start it.
|
call `start` to start it. If False, the pool will run as soon as
|
||||||
|
jobs are added to it.
|
||||||
'''
|
'''
|
||||||
if not isinstance(size, int):
|
if not isinstance(size, int):
|
||||||
raise TypeError(f'size must be an int, not {type(size)}.')
|
raise TypeError(f'size must be an int, not {type(size)}.')
|
||||||
if size < 1:
|
if size < 1:
|
||||||
raise ValueError(f'size must be >= 1, not {size}.')
|
raise ValueError(f'size must be >= 1, not {size}.')
|
||||||
|
|
||||||
self.max_size = size
|
self.max_size = size
|
||||||
self.closed = False
|
|
||||||
self.paused = paused
|
self.paused = paused
|
||||||
self._jobs = []
|
|
||||||
|
self._closed = False
|
||||||
|
self._running_count = 0
|
||||||
|
self._result_queue = None
|
||||||
|
self._pending_jobs = lazychain.LazyChain()
|
||||||
self._job_manager_lock = threading.Lock()
|
self._job_manager_lock = threading.Lock()
|
||||||
|
self._all_done_event = threading.Event()
|
||||||
def _clear_done_jobs(self):
|
self._all_done_event.set()
|
||||||
'''
|
|
||||||
This function assumes that _job_manager_lock is acquired!!
|
|
||||||
You should call start instead!
|
|
||||||
'''
|
|
||||||
self._jobs[:] = [j for j in self._jobs if j.status in {PENDING, RUNNING}]
|
|
||||||
|
|
||||||
def _start_jobs(self):
|
|
||||||
'''
|
|
||||||
This function assumes that _job_manager_lock is acquired!!
|
|
||||||
You should call start instead!
|
|
||||||
'''
|
|
||||||
available = self.max_size - self.running_count
|
|
||||||
available = max(0, available)
|
|
||||||
if available == 0:
|
|
||||||
return
|
|
||||||
for job in list(self._jobs):
|
|
||||||
if job.status == PENDING:
|
|
||||||
job.start()
|
|
||||||
available -= 1
|
|
||||||
if available == 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
def _clear_done_and_start_jobs(self):
|
|
||||||
'''
|
|
||||||
This function assumes that _job_manager_lock is acquired!!
|
|
||||||
You should call start instead!
|
|
||||||
'''
|
|
||||||
self._clear_done_jobs()
|
|
||||||
self._start_jobs()
|
|
||||||
|
|
||||||
def _job_finished(self):
|
def _job_finished(self):
|
||||||
'''
|
'''
|
||||||
When a job finishes, it will call here.
|
When a job finishes, it will call here so that a new job can be started.
|
||||||
'''
|
'''
|
||||||
if self.paused:
|
self._running_count -= 1
|
||||||
return
|
|
||||||
|
|
||||||
# Although this method is private, we are calling the public `start`
|
if not self.paused:
|
||||||
# instead of the private method because we do not hold the lock.
|
self.start()
|
||||||
self.start()
|
|
||||||
|
@property
|
||||||
|
def closed(self):
|
||||||
|
return self.closed
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def running_count(self):
|
def running_count(self):
|
||||||
return sum(1 for job in list(self._jobs) if job.status is RUNNING)
|
return self._running_count
|
||||||
|
|
||||||
@property
|
|
||||||
def unfinished_count(self):
|
|
||||||
return sum(1 for job in list(self._jobs) if job.status in {PENDING, RUNNING})
|
|
||||||
|
|
||||||
def assert_not_closed(self):
|
def assert_not_closed(self):
|
||||||
'''
|
'''
|
||||||
If the pool is closed (because you called `join`), raise PoolClosed.
|
If the pool is closed (because you called `join`), raise PoolClosed.
|
||||||
Otherwise do nothing.
|
Otherwise do nothing.
|
||||||
'''
|
'''
|
||||||
if self.closed:
|
if self._closed:
|
||||||
raise PoolClosed()
|
raise PoolClosed()
|
||||||
|
|
||||||
def add(self, function, *, name=None, args=tuple(), kwargs=dict()):
|
def add(self, function, *, name=None, callback=None, args=tuple(), kwargs=dict()):
|
||||||
'''
|
'''
|
||||||
Add a new job to the pool. Jobs are run in the order they are added.
|
Add a new job to the pool.
|
||||||
|
|
||||||
Don't forget that in order to write a tuple of length 1 you must still
|
See the Job class for parameter details.
|
||||||
add a comma on the end. `add(print, args=(4))` is an error, you need to
|
|
||||||
`add(print, args=(4,))` or use a list instead: `add(print, args=[4])`.
|
|
||||||
|
|
||||||
name:
|
|
||||||
An optional value that will appear in the repr of the job and
|
|
||||||
has no other purpose. Use this if you intend to print(job) and want
|
|
||||||
a human friendly name string.
|
|
||||||
'''
|
'''
|
||||||
self.assert_not_closed()
|
self.assert_not_closed()
|
||||||
|
|
||||||
with self._job_manager_lock:
|
job = Job(
|
||||||
job = Job(
|
pool=self,
|
||||||
pool=self,
|
function=function,
|
||||||
function=function,
|
name=name,
|
||||||
name=name,
|
args=args,
|
||||||
args=args,
|
kwargs=kwargs,
|
||||||
kwargs=kwargs,
|
)
|
||||||
)
|
self._pending_jobs.append(job)
|
||||||
self._jobs.append(job)
|
|
||||||
|
|
||||||
if not self.paused:
|
if not self.paused:
|
||||||
self._clear_done_and_start_jobs()
|
self.start()
|
||||||
|
|
||||||
return job
|
return job
|
||||||
|
|
||||||
|
def add_generator(self, kwargs_gen):
|
||||||
|
'''
|
||||||
|
Add jobs from a generator which yields kwarg dictionaries. Unlike
|
||||||
|
`add` and `add_many`, the Job objects are not returned by this method
|
||||||
|
(since they don't exist yet!). If you want them, use `result_generator`
|
||||||
|
to iterate the pool's jobs as they complete. Otherwise, they should
|
||||||
|
have their own side effects or use a callback.
|
||||||
|
|
||||||
|
See the Job class for kwarg details.
|
||||||
|
'''
|
||||||
|
self.assert_not_closed()
|
||||||
|
|
||||||
|
these_jobs = (Job(pool=self, **kwargs) for kwargs in kwargs_gen)
|
||||||
|
self._pending_jobs.extend(these_jobs)
|
||||||
|
|
||||||
|
if not self.paused:
|
||||||
|
self.start()
|
||||||
|
|
||||||
def add_many(self, kwargss):
|
def add_many(self, kwargss):
|
||||||
'''
|
'''
|
||||||
Add multiple new jobs to the pool at once. Useful to prevent the
|
Add multiple new jobs to the pool at once. This is better than calling
|
||||||
excessive lock-waiting that you get from calling regular `add` in a
|
`add` in a loop because we only have to acquire the lock one time.
|
||||||
loop while other jobs are finishing and triggering queue maintenance.
|
|
||||||
|
|
||||||
Provide an iterable of kwarg dictionaries. That is:
|
Provide an iterable of kwarg dictionaries. That is:
|
||||||
[
|
[
|
||||||
{'function': print, 'args': [4], 'name': '4'},
|
{'function': print, 'args': [4], 'name': '4'},
|
||||||
{'function': sample, 'kwargs': {'x': 2}},
|
{'function': sample, 'kwargs': {'x': 2}},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
See the Job class for kwarg details.
|
||||||
'''
|
'''
|
||||||
self.assert_not_closed()
|
self.assert_not_closed()
|
||||||
|
|
||||||
with self._job_manager_lock:
|
these_jobs = [Job(pool=self, **kwargs) for kwargs in kwargss]
|
||||||
these_jobs = []
|
self._pending_jobs.extend(these_jobs)
|
||||||
for kwargs in kwargss:
|
|
||||||
kwargs.pop('pool', None)
|
|
||||||
job = Job(pool=self, **kwargs)
|
|
||||||
these_jobs.append(job)
|
|
||||||
self._jobs.append(job)
|
|
||||||
|
|
||||||
if not self.paused:
|
if not self.paused:
|
||||||
self._clear_done_and_start_jobs()
|
self.start()
|
||||||
|
|
||||||
return these_jobs
|
return these_jobs
|
||||||
|
|
||||||
|
@ -153,33 +174,100 @@ class ThreadPool:
|
||||||
Permanently close the pool, preventing any new jobs from being added,
|
Permanently close the pool, preventing any new jobs from being added,
|
||||||
and block until all jobs are complete.
|
and block until all jobs are complete.
|
||||||
'''
|
'''
|
||||||
self.closed = True
|
self._closed = True
|
||||||
self.start()
|
self.start()
|
||||||
for job in self._jobs:
|
self._all_done_event.wait()
|
||||||
|
|
||||||
|
def result_generator(self):
|
||||||
|
'''
|
||||||
|
This generator will start the job pool, then yield finished/raised Job
|
||||||
|
objects in the order they were added. Note that a slow job will
|
||||||
|
therefore hold up the generator, though it will not stop the job pool
|
||||||
|
from running and spawning new jobs in their other threads.
|
||||||
|
|
||||||
|
For best results, you should create the pool in the paused state, add
|
||||||
|
your jobs, then use this method to start the pool. Any jobs that run
|
||||||
|
while the result_generator is not active will not be stored, since we
|
||||||
|
don't necessarily know if this method will ever be used. So, any jobs
|
||||||
|
that start before the result_generator is active will not be yielded
|
||||||
|
and will simply be lost to garbage collection.
|
||||||
|
|
||||||
|
If more jobs are added while the generator is running, they will be
|
||||||
|
yielded as expected.
|
||||||
|
|
||||||
|
When there are no more outstanding jobs, the generator will stop
|
||||||
|
iteration and return. If the pool was paused before generating, it
|
||||||
|
will be paused again.
|
||||||
|
'''
|
||||||
|
if self._result_queue is not None:
|
||||||
|
raise TypeError('The result generator is already open.')
|
||||||
|
self._result_queue = queue.Queue()
|
||||||
|
|
||||||
|
was_paused = self.paused
|
||||||
|
self.start()
|
||||||
|
while (not self._all_done_event.is_set()) or (not self._result_queue.empty()):
|
||||||
|
job = self._result_queue.get()
|
||||||
job.join()
|
job.join()
|
||||||
|
yield job
|
||||||
|
self._result_queue.task_done()
|
||||||
|
self._result_queue = None
|
||||||
|
if was_paused:
|
||||||
|
self.paused = True
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
'''
|
self.paused = False
|
||||||
Remove finished and raised jobs from the queue and start some new jobs.
|
|
||||||
|
|
||||||
The job queue is maintained automatically while adding new jobs and
|
|
||||||
when a job finishes, as long as the pool is not paused, so you should
|
|
||||||
not have to call it yourself. If you do pause the pool, use this method
|
|
||||||
to restart it.
|
|
||||||
|
|
||||||
Because the pool's internal job queue is flushed regularly, you should
|
|
||||||
store your own references to jobs to get their return values.
|
|
||||||
'''
|
|
||||||
with self._job_manager_lock:
|
with self._job_manager_lock:
|
||||||
self._clear_done_and_start_jobs()
|
available = self.max_size - self._running_count
|
||||||
self.paused = False
|
|
||||||
|
no_more_jobs = False
|
||||||
|
for x in range(available):
|
||||||
|
try:
|
||||||
|
job = next(self._pending_jobs)
|
||||||
|
except StopIteration:
|
||||||
|
no_more_jobs = True
|
||||||
|
break
|
||||||
|
|
||||||
|
self._all_done_event.clear()
|
||||||
|
job.start()
|
||||||
|
self._running_count += 1
|
||||||
|
if self._result_queue is not None:
|
||||||
|
self._result_queue.put(job)
|
||||||
|
|
||||||
|
if self._running_count == 0 and no_more_jobs:
|
||||||
|
self._all_done_event.set()
|
||||||
|
|
||||||
class Job:
|
class Job:
|
||||||
def __init__(self, pool, function, *, name=None, args=tuple(), kwargs=dict()):
|
'''
|
||||||
|
Each job contains one function that it will call when it is started.
|
||||||
|
|
||||||
|
If the function completes successfully you will find the return value in
|
||||||
|
`job.value`. If it raises an exception, you'll find it in `job.exception`,
|
||||||
|
although the thread itself will not raise.
|
||||||
|
|
||||||
|
All job threads are daemons and will not prevent the main thread from
|
||||||
|
terminating. Call `job.join()` or `pool.join()` in the main thread to
|
||||||
|
ensure jobs complete.
|
||||||
|
'''
|
||||||
|
def __init__(self, pool, function, *, name=None, callback=None, args=tuple(), kwargs=dict()):
|
||||||
|
'''
|
||||||
|
When this job is started, `function(*args, **kwargs)` will be called.
|
||||||
|
|
||||||
|
name:
|
||||||
|
An optional value that will appear in the repr of the job and
|
||||||
|
has no other purpose. Use this if you intend to print(job) and want
|
||||||
|
a human friendly name string.
|
||||||
|
|
||||||
|
callback:
|
||||||
|
An optional function which will be called as `callback(job)` after
|
||||||
|
the job is finished running. Use this for async-style processing of
|
||||||
|
the job. Note that the callback is called via the job's thread, so
|
||||||
|
make sure it is thread-safe.
|
||||||
|
'''
|
||||||
self.pool = pool
|
self.pool = pool
|
||||||
self.name = name
|
self.name = name
|
||||||
self.status = PENDING
|
self.status = PENDING
|
||||||
self.function = function
|
self.function = function
|
||||||
|
self.callback = callback
|
||||||
self.args = args
|
self.args = args
|
||||||
self.kwargs = kwargs
|
self.kwargs = kwargs
|
||||||
self.value = NO_RETURN
|
self.value = NO_RETURN
|
||||||
|
@ -195,20 +283,22 @@ class Job:
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
if self.name:
|
if self.name:
|
||||||
return f'<{self.status} Job {repr(self.name)}>'
|
return f'<{self.status.name} Job {repr(self.name)}>'
|
||||||
else:
|
else:
|
||||||
return f'<{self.status} Job on {self.function}>'
|
return f'<{self.status.name} Job on {self.function}>'
|
||||||
|
|
||||||
def _run(self):
|
def _run(self):
|
||||||
try:
|
try:
|
||||||
self.value = self.function(*self.args, **self.kwargs)
|
self.value = self.function(*self.args, **self.kwargs)
|
||||||
self.status = FINISHED
|
self.status = FINISHED
|
||||||
except Exception as exc:
|
except BaseException as exc:
|
||||||
self.exception = exc
|
self.exception = exc
|
||||||
self.status = RAISED
|
self.status = RAISED
|
||||||
self._thread = None
|
self._thread = None
|
||||||
self.pool._job_finished()
|
|
||||||
self._joinme_lock.release()
|
self._joinme_lock.release()
|
||||||
|
self.pool._job_finished()
|
||||||
|
if self.callback is not None:
|
||||||
|
self.callback(self)
|
||||||
|
|
||||||
def join(self):
|
def join(self):
|
||||||
'''
|
'''
|
||||||
|
@ -218,11 +308,6 @@ class Job:
|
||||||
self._joinme_lock.release()
|
self._joinme_lock.release()
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
'''
|
|
||||||
Start the job. If the function completes successfully you will find the
|
|
||||||
return value in `value`. If it raises an exception, you'll find it in
|
|
||||||
`exception`, although the thread itself will not raise.
|
|
||||||
'''
|
|
||||||
self.status = RUNNING
|
self.status = RUNNING
|
||||||
self._thread = threading.Thread(target=self._run)
|
self._thread = threading.Thread(target=self._run)
|
||||||
self._thread.daemon = True
|
self._thread.daemon = True
|
||||||
|
|
Loading…
Reference in a new issue