droidz/droidz.py

import argparse
import bs4
import datetime
import os
import re
import requests
import sqlite3
import subprocess
import sys
import types

from voussoirkit import betterhelp
from voussoirkit import pathclass
from voussoirkit import ratelimiter
from voussoirkit import sqlhelpers
from voussoirkit import threadpool
from voussoirkit import winwhich

CATEGORIES = [
'stickmen',
'stickpacks',
'vehicles',
'weapons',
'objects',
'random',
'effects',
'backgrounds',
]

DB_INIT = '''
BEGIN;
CREATE TABLE IF NOT EXISTS sticks(
id TEXT PRIMARY KEY NOT NULL,
name TEXT,
description TEXT,
date INT,
author TEXT,
    download_link TEXT,
category TEXT,
downloads INT,
version TEXT,
vote_score INT,
usage_rating TEXT,
retrieved INT
);
CREATE INDEX IF NOT EXISTS index_sticks_id ON sticks(id);
COMMIT;
'''
SQL_COLUMNS = sqlhelpers.extract_table_column_map(DB_INIT)
sql = sqlite3.connect('sticks.db')
sql.executescript(DB_INIT)

USERAGENT = '''
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/79.0.3945.130 Safari/537.36
'''.replace('\n', ' ').strip()

HEADERS = {
'User-Agent': USERAGENT
}

session = requests.Session()
session.headers.update(HEADERS)

DOWNLOAD_RATELIMITER = ratelimiter.Ratelimiter(allowance=1, period=5)
WINRAR = winwhich.which('winrar')

def get_now():
    return datetime.datetime.now(datetime.timezone.utc).timestamp()

def id_from_direct_url(direct_url):
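    '''
    Extract the Stick ID from a direct URL like http://droidz.org/direct/1234,
    dropping any trailing path or query string.
    '''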
id = direct_url.split('/direct/')[-1]
id = id.split('/')[0].split('?')[0]
    return id

# DB FUNCTIONS
################################################################################
def select_stick(id):
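    '''
    Return the database row for this Stick ID, or None if it is not stored.
    '''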
cur = sql.cursor()
cur.execute('SELECT * FROM sticks WHERE id == ?', [id])
    return cur.fetchone()

def insert_id(id, commit=True):
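    '''
    Add a placeholder row for this ID if it is not already in the database,
    so its full data can be scraped later. Returns a namespace whose is_new
    attribute tells whether the ID was previously unknown.
    '''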
cur = sql.cursor()
cur.execute('SELECT 1 FROM sticks WHERE id == ?', [id])
existing = cur.fetchone()
if not existing:
data = {'id': id}
columns = SQL_COLUMNS['sticks']
(qmarks, bindings) = sqlhelpers.insert_filler(data)
query = f'INSERT INTO sticks {qmarks}'
cur.execute(query, bindings)
if commit:
        sql.commit()

    status = types.SimpleNamespace(id=id, is_new=not existing)
    return status

def insert_ids(ids, commit=True):
for id in ids:
insert_id(id, commit=False)
if commit:
        sql.commit()

def insert_stick(data, commit=True):
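    '''
    Insert or update the row for this Stick, given a full data dict such as
    the one returned by scrape_direct.
    '''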
cur = sql.cursor()
cur.execute('SELECT 1 FROM sticks WHERE id == ?', [data['id']])
existing = cur.fetchone()
if existing:
(qmarks, bindings) = sqlhelpers.update_filler(data, 'id')
query = f'UPDATE sticks {qmarks}'
else:
(qmarks, bindings) = sqlhelpers.insert_filler(data)
query = f'INSERT INTO sticks {qmarks}'
cur.execute(query, bindings)
if commit:
        sql.commit()

def insert_sticks(datas, commit=True):
for data in datas:
insert_stick(data, commit=False)
if commit:
        sql.commit()

# SCRAPE
################################################################################
def request(url):
print(url)
response = session.get(url)
response.raise_for_status()
    return response

def scrape_direct(id, commit=True):
'''
Return the dict of Stick data for this ID.
'''
url = f'http://droidz.org/direct/{id}'
response = request(url)
text = response.text
# I had a weird issue where some brs were not self-closing and they
# contained a bunch of other elements. This whitespace replacement fixed
# the issue but I didn't quite understand why.
text = re.sub(r'<\s*br\s*/\s*>', '<br/>', text)
soup = bs4.BeautifulSoup(text, 'html.parser')
for br in soup.find_all('br'):
br.replace_with('\n')
stick_info = soup.select('.content')[1].get_text()
author = soup.find('a', href=re.compile(r'search\.php\?searchq=')).get_text()
vote_score = int(re.search(r'Vote Score: ([-\d]+)\s*$', stick_info, flags=re.M).group(1))
downloads = int(re.search(r'Downloads: (\d+)\s*$', stick_info, flags=re.M).group(1))
category = re.search(r'Category: (.+?)\s*$', stick_info, flags=re.M).group(1)
version = re.search(r'Version: (.+?)\s*$', stick_info, flags=re.M).group(1)
usage_rating = re.search(r'Usage Rating: (.+?)\s*$', stick_info, flags=re.M).group(1)
date = re.search(r'Date Submitted: (.+?)\s*$', stick_info, flags=re.M).group(1)
date = datetime.datetime.strptime(date, '%B %d, %Y')
date = date.timestamp()
name = soup.select_one('.section .top h2').get_text().strip()
description = soup.select_one('.section .content').get_text().strip()
if description == f'{author}, has left no comments for this submission.':
description = None
else:
description = description.replace(f'{author} says, ', '')
download_link = soup.find('a', href=re.compile(r'/resources/grab\.php\?file='))['href']
retrieved = int(get_now())
data = {
'id': id,
'name': name,
'description': description,
'date': date,
'author': author,
'download_link': download_link,
'category': category,
'downloads': downloads,
'version': version,
'vote_score': vote_score,
'usage_rating': usage_rating,
'retrieved': retrieved,
}
    return data

def scrape_directs(ids, threads=1, commit=True):
'''
Given many Stick IDs, yield Stick datas.
'''
if threads < 1:
        raise ValueError(threads)

    if threads == 1:
for id in ids:
yield scrape_direct(id)
else:
pool = threadpool.ThreadPool(size=threads)
kwargss = [
{'function': scrape_direct, 'args': [id], 'name': id}
for id in ids
]
pool.add_many(kwargss)
for job in pool.result_generator():
if job.exception:
raise job.exception
            yield job.value

def scrape_category(category):
'''
Yield Stick IDs from all pages within this category. They are listed in
alphabetical order by Stick name.
'''
page = 1
all_directs = set()
while True:
url = f'http://droidz.org/stickmain/{category}.php?page={page}'
response = request(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
this_directs = soup.find_all('a', href=re.compile(r'/direct/\d+'))
prev_count = len(all_directs)
all_directs.update(this_directs)
if len(all_directs) == prev_count:
break
page += 1
for direct in this_directs:
id = id_from_direct_url(direct['href'])
            yield id

def scrape_latest():
'''
Yield the latest Stick IDs from the /stickmain homepage, most recent first.
'''
url = 'http://droidz.org/stickmain/'
response = request(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
h2s = soup.find_all('h2')
for h2 in h2s:
if 'Latest 50 Accepted' in h2.get_text():
latest_50_h2 = h2
break
div = latest_50_h2.parent
directs = div.find_all('a', href=re.compile(r'/direct/\d+'))
for direct in directs:
id = id_from_direct_url(direct['href'])
        yield id

# UPDATE
################################################################################
def incremental_update(threads=1):
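    '''
    Add IDs from the Latest-50 box on the homepage. If even the oldest entry
    in that box is new to the database, older sticks may have been missed, so
    every category is re-scanned for IDs. Finally, full data is scraped for
    any sticks that have never been retrieved.
    '''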
latest_ids = scrape_latest()
for id in latest_ids:
        status = insert_id(id, commit=False)

    if status.is_new:
print('The Latest box didn\'t contain everything.')
print('Need to check the categories for new sticks.')
for category in CATEGORIES:
ids = scrape_category(category)
insert_ids(ids)
else:
        print('No new sticks for incremental update.')

    cur = sql.execute('SELECT id FROM sticks WHERE retrieved IS NULL')
ids = [row[0] for row in cur.fetchall()]
sticks = scrape_directs(ids, threads=threads)
try:
insert_sticks(sticks)
except KeyboardInterrupt:
        sql.commit()

def full_update(threads=1):
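    '''
    Re-scan every category for Stick IDs, then re-scrape every stick in the
    database (oldest retrieval first) to refresh its info.
    '''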
for category in CATEGORIES:
ids = scrape_category(category)
        insert_ids(ids)

    cur = sql.cursor()
cur.execute('SELECT id FROM sticks ORDER BY retrieved ASC')
ids = [row[0] for row in cur.fetchall()]
sticks = scrape_directs(ids, threads=threads)
try:
insert_sticks(sticks)
except KeyboardInterrupt:
        sql.commit()

# DOWNLOAD
################################################################################
def download_stick(id, overwrite=False, extract=False):
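    '''
    Download this stick's file into download/<id>/, respecting the download
    ratelimiter. If that directory already exists it is skipped unless
    overwrite is True. With extract=True, zip files are unpacked by winrar
    (some of the site's "zips" are actually rars) and the archive is deleted
    afterward.
    '''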
directory = pathclass.Path('download').with_child(id)
if directory.exists and not overwrite:
return directory
cur = sql.execute('SELECT download_link FROM sticks WHERE id == ?', [id])
download_link = cur.fetchone()[0]
filename = re.search(r'file=(.+)', download_link).group(1)
filepath = directory.with_child(filename)
DOWNLOAD_RATELIMITER.limit()
print(f'Downloading {id}')
response = request(download_link)
directory.makedirs(exist_ok=True)
with filepath.open('wb') as handle:
handle.write(response.content)
if extract and WINRAR is not None and filepath.extension == 'zip':
# As much as I would like to use Python's zipfile module, I found that
# some of the .zips on the site are actually rars.
command = [
WINRAR, 'x',
'-o+', '-ibck',
filepath.absolute_path,
'*.*',
directory.absolute_path + os.sep,
]
subprocess.run(command)
os.remove(filepath.absolute_path)
    return directory

def download_all(overwrite=False, extract=False):
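    '''
    Download the files for every stick in the database.
    '''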
cur = sql.cursor()
cur.execute('SELECT id FROM sticks')
ids = [row[0] for row in cur.fetchall()]
for id in ids:
        download_stick(id, overwrite=overwrite, extract=extract)

# COMMAND LINE
################################################################################
def update_argparse(args):
if args.full:
return full_update(threads=args.threads)
else:
        return incremental_update(threads=args.threads)

def download_argparse(args):
if args.extract and not WINRAR:
raise Exception('The --extract flag requires you to have winrar on your path.')
if len(args.ids) == 1 and args.ids[0] == 'all':
return download_all(overwrite=args.overwrite, extract=args.extract)
else:
        for id in args.ids:
            download_stick(id, overwrite=args.overwrite, extract=args.extract)

def main(argv):
parser = argparse.ArgumentParser(description='Scrape sticks from droidz.org.')
    subparsers = parser.add_subparsers()

    ################################################################################################
p_update = subparsers.add_parser(
'update',
description='''
Update the database with stick info.
''',
)
p_update.add_argument(
'--full',
dest='full',
action='store_true',
help='''
Re-scrape all categories and all sticks to get fresh info.
Otherwise, only new sticks will be scraped.
''',
)
p_update.add_argument(
'--threads', dest='threads', type=int, default=1,
)
    p_update.set_defaults(func=update_argparse)

    ################################################################################################
    p_download = subparsers.add_parser(
'download',
description='''
Download the stick files.
''',
)
p_download.examples = [
'all',
'100 200 300 --overwrite',
]
p_download.add_argument(
'ids',
nargs='+',
default=None,
help='''
One or more stick IDs to download. You can use the word "all" to download
all sticks.
''',
)
p_download.add_argument(
'--overwrite',
dest='overwrite',
action='store_true',
help='''
Re-download any existing files. Otherwise they'll be skipped.
''',
)
p_download.add_argument(
'--extract',
dest='extract',
action='store_true',
help='''
Extract downloaded zip files.
NOTE: Some files on the site are labeled as .zip but are actually rars,
so this extraction process requires you to have winrar on your PATH.
Sorry.
''',
)
    p_download.set_defaults(func=download_argparse)

    return betterhelp.go(parser, argv)

if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))