Use updated threadpool, add some docstrings.

commit 29c80b2662
parent 173dd60451

1 changed file with 18 additions and 9 deletions
droidz.py (+18 −9)

@@ -135,6 +135,9 @@ def request(url):
     return response
 
 def scrape_direct(id, commit=True):
+    '''
+    Return the dict of Stick data for this ID.
+    '''
     url = f'http://droidz.org/direct/{id}'
     response = request(url)
     text = response.text
@@ -186,6 +189,9 @@ def scrape_direct(id, commit=True):
     return data
 
 def scrape_directs(ids, threads=1, commit=True):
+    '''
+    Given many Stick IDs, yield Stick datas.
+    '''
     if threads < 1:
         raise ValueError(threads)
 
@@ -199,15 +205,17 @@ def scrape_directs(ids, threads=1, commit=True):
         {'function': scrape_direct, 'args': [id], 'name': id}
         for id in ids
     ]
-    jobs = pool.add_many(kwargss)
-    while jobs:
-        job = jobs.pop(0)
-        job.join()
+    pool.add_many(kwargss)
+    for job in pool.result_generator():
         if job.exception:
             raise job.exception
         yield job.value
 
 def scrape_category(category):
+    '''
+    Yield Stick IDs from all pages within this category. They are listed in
+    alphabetical order by Stick name.
+    '''
     page = 1
     all_directs = set()
     while True:
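This hunk is the heart of the commit: rather than keeping a list of job handles and join()ing each one in order, the caller now iterates pool.result_generator() and lets the pool track completion. The threadpool module itself is external to this file, so as a rough stand-in, the same consumption pattern can be sketched with the standard library (a sketch under that assumption, not the module's actual API):

    # Hedged sketch: emulating the new consumption pattern in scrape_directs
    # with concurrent.futures instead of the author's threadpool module.
    from concurrent.futures import ThreadPoolExecutor

    def scrape_directs_sketch(ids, threads=1):
        if threads < 1:
            raise ValueError(threads)
        with ThreadPoolExecutor(max_workers=threads) as pool:
            futures = [pool.submit(scrape_direct, id) for id in ids]
            for future in futures:
                # result() re-raises any exception from the worker thread,
                # which mirrors the `raise job.exception` branch above.
                yield future.result()

Either way, the win is the same: the pool, not the caller, owns the bookkeeping for outstanding jobs.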
@@ -225,6 +233,9 @@ def scrape_category(category):
             yield id
 
 def scrape_latest():
+    '''
+    Yield the latest Stick IDs from the /stickmain homepage, most recent first.
+    '''
     url = 'http://droidz.org/stickmain/'
     response = request(url)
     soup = bs4.BeautifulSoup(response.text, 'html.parser')
@@ -256,8 +267,7 @@ def incremental_update(threads=1):
     else:
         print('No new sticks for incremental update.')
 
-    cur = sql.cursor()
-    cur.execute('SELECT id FROM sticks WHERE retrieved IS NULL')
+    cur = sql.execute('SELECT id FROM sticks WHERE retrieved IS NULL')
     ids = [row[0] for row in cur.fetchall()]
 
     sticks = scrape_directs(ids, threads=threads)
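Both cursor cleanups in this commit (here and in download_stick below) rely on the same fact: sqlite3.Connection.execute() is a documented shortcut that creates a temporary cursor, executes the statement on it, and returns that cursor. A self-contained illustration; the in-memory schema here is invented for the demo:

    # sqlite3.Connection.execute creates a cursor, runs the statement on it,
    # and returns that cursor, so cursor() followed by execute() collapses
    # into a single call.
    import sqlite3

    sql = sqlite3.connect(':memory:')
    sql.execute('CREATE TABLE sticks (id INTEGER PRIMARY KEY, retrieved INTEGER)')
    sql.execute('INSERT INTO sticks (id) VALUES (1), (2)')

    cur = sql.execute('SELECT id FROM sticks WHERE retrieved IS NULL')
    ids = [row[0] for row in cur.fetchall()]
    print(ids)  # [1, 2]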
@@ -288,8 +298,7 @@ def download_stick(id, overwrite=False, extract=False):
     if directory.exists and not overwrite:
         return directory
 
-    cur = sql.cursor()
-    cur.execute('SELECT download_link FROM sticks WHERE id == ?', [id])
+    cur = sql.execute('SELECT download_link FROM sticks WHERE id == ?', [id])
     download_link = cur.fetchone()[0]
     filename = re.search(r'file=(.+)', download_link).group(1)
     filepath = directory.with_child(filename)
@@ -302,7 +311,7 @@ def download_stick(id, overwrite=False, extract=False):
     with filepath.open('wb') as handle:
         handle.write(response.content)
 
-    if extract and filepath.extension == 'zip':
+    if extract and WINRAR is not None and filepath.extension == 'zip':
         # As much as I would like to use Python's zipfile module, I found that
         # some of the .zips on the site are actually rars.
         command = [
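The added WINRAR guard skips extraction when no WinRAR executable was found. This diff does not show how WINRAR is defined; a plausible, purely hypothetical sketch is a module-level PATH lookup that resolves to None when the binary is absent (the command-line flags below are illustrative, not taken from this commit):

    # Purely hypothetical definition of the WINRAR constant checked above:
    # None when the executable is not on PATH, so extraction is skipped.
    import shutil
    import subprocess

    WINRAR = shutil.which('winrar')

    def extract_archive_sketch(filepath, directory):
        if WINRAR is None:
            return
        # 'x' extracts with full paths; a real rar tool also copes with the
        # site's .zips that are actually rars, the reason zipfile is avoided.
        subprocess.run([WINRAR, 'x', str(filepath), str(directory)], check=True)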