Initial commit.
This commit is contained in:
commit
2ee9d072fe
2 changed files with 356 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
download/
|
||||
*.db
|
354
droidz.py
Normal file
354
droidz.py
Normal file
|
@ -0,0 +1,354 @@
|
|||
import argparse
|
||||
import bs4
|
||||
import datetime
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import types
|
||||
|
||||
from voussoirkit import pathclass
|
||||
from voussoirkit import ratelimiter
|
||||
from voussoirkit import sqlhelpers
|
||||
from voussoirkit import winwhich
|
||||
|
||||
# Names of the category listing pages to crawl. scrape_category substitutes
# each one into http://droidz.org/stickmain/{category}.php, and both
# incremental_update and full_update iterate over this list.
CATEGORIES = [
    'stickmen',
    'stickpacks',
    'vehicles',
    'weapons',
    'objects',
    'random',
    'effects',
    'backgrounds',
]
|
||||
|
||||
# Schema script for the sticks database. It is executed on every import via
# sql.executescript, so every statement uses IF NOT EXISTS. It is also parsed
# by sqlhelpers.extract_table_column_map to build SQL_COLUMNS.
#
# download_link is declared TEXT like the other string columns. Because of
# IF NOT EXISTS this only affects newly created databases; existing files keep
# whatever declaration they were created with.
DB_INIT = '''
BEGIN;
CREATE TABLE IF NOT EXISTS sticks(
    id TEXT PRIMARY KEY NOT NULL,
    name TEXT,
    description TEXT,
    date INT,
    author TEXT,
    download_link TEXT,
    category TEXT,
    downloads INT,
    version TEXT,
    vote_score INT,
    usage_rating TEXT,
    retrieved INT
);
CREATE INDEX IF NOT EXISTS index_sticks_id ON sticks(id);
COMMIT;
'''
|
||||
|
||||
# Table -> column names, parsed out of the DB_INIT script. insert_id uses
# SQL_COLUMNS['sticks'] to build a full-width INSERT statement.
SQL_COLUMNS = sqlhelpers.extract_table_column_map(DB_INIT)

# Module-wide database connection, opened (and the schema applied) as a side
# effect of importing this file. The relative filename means sticks.db is
# resolved against the current working directory.
sql = sqlite3.connect('sticks.db')
sql.executescript(DB_INIT)

# Browser-like user agent sent with every request. The literal is split over
# two lines for readability; the newline is replaced with a space.
USERAGENT = '''Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/79.0.3945.130 Safari/537.36'''.replace('\n', ' ').strip()

HEADERS = {
    'User-Agent': USERAGENT
}

# REQUEST_RATELIMITER is applied by request() to every HTTP call;
# DOWNLOAD_RATELIMITER is applied additionally by download_stick.
# NOTE(review): presumably allowance/period mean "N calls per M seconds" --
# confirm against voussoirkit.ratelimiter.
REQUEST_RATELIMITER = ratelimiter.Ratelimiter(allowance=1, period=1)
DOWNLOAD_RATELIMITER = ratelimiter.Ratelimiter(allowance=1, period=5)

# Location of the winrar executable used by download_stick for extraction.
# download_argparse treats a falsy value here as "winrar is not available".
WINRAR = winwhich.which('winrar')
|
||||
|
||||
def get_now():
    '''
    Return the current time as a Unix timestamp (float seconds), taken from
    a timezone-aware UTC datetime.
    '''
    utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
    return utc_now.timestamp()
|
||||
|
||||
def request(url, timeout=60):
    '''
    Perform a GET request through the shared request ratelimiter and return
    the response object.

    url:
        The address to fetch.

    timeout:
        Seconds that requests will wait on the server before giving up.
        Without a timeout, requests waits indefinitely, so a single stalled
        connection would hang the whole scrape.

    Raises requests.HTTPError (from raise_for_status) for 4xx / 5xx responses.
    '''
    REQUEST_RATELIMITER.limit()
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return response
|
||||
|
||||
def id_from_direct_url(direct_url):
    '''
    Extract the stick id from a /direct/ link.

    Takes whatever follows the last '/direct/' in the url and cuts it off at
    the first '/' and then at the first '?'. If the url does not contain
    '/direct/', the same trimming is applied to the whole string.
    '''
    after_marker = direct_url.split('/direct/')[-1]
    first_segment = after_marker.partition('/')[0]
    return first_segment.partition('?')[0]
|
||||
|
||||
def insert_id(id):
    '''
    Make sure the sticks table has a row for this id.

    If no row exists, a new one is inserted with only the id column filled in;
    the remaining columns are left for scrape_direct to populate. This function
    does not commit; the caller is responsible for that.

    Returns a namespace with attributes `id` and `is_new`, where is_new is True
    if the row was inserted by this call.
    '''
    cur = sql.cursor()
    cur.execute('SELECT 1 FROM sticks WHERE id == ?', [id])
    already_present = cur.fetchone()
    if already_present:
        return types.SimpleNamespace(id=id, is_new=False)

    (qmarks, bindings) = sqlhelpers.insert_filler(
        SQL_COLUMNS['sticks'],
        {'id': id},
        require_all=False,
    )
    cur.execute(f'INSERT INTO sticks VALUES({qmarks})', bindings)
    return types.SimpleNamespace(id=id, is_new=True)
|
||||
|
||||
# SCRAPE
|
||||
################################################################################
|
||||
def scrape_direct(id):
    '''
    Fetch the detail page of a single stick, parse its metadata, and write it
    to the sticks table (inserting the row first if necessary), then commit.

    id:
        The stick id as used in http://droidz.org/direct/{id}.

    Returns the dict of column values that was written.

    Raises ValueError if one of the expected "Label: value" lines cannot be
    found in the page's info box. Other layout changes will still surface as
    IndexError / AttributeError / TypeError from the individual lookups.
    '''
    url = f'http://droidz.org/direct/{id}'
    print(url)
    response = request(url)
    text = response.text

    # I had a weird issue where some brs were not self-closing and they
    # contained a bunch of other elements. This whitespace replacement fixed
    # the issue but I didn't quite understand why.
    text = re.sub(r'<\s*br\s*/\s*>', '<br/>', text)
    soup = bs4.BeautifulSoup(text, 'html.parser')

    # Turn line breaks into real newlines so that each "Label: value" pair
    # ends up on its own line of get_text() and can be matched with re.M.
    for br in soup.find_all('br'):
        br.replace_with('\n')

    stick_info = soup.select('.content')[1].get_text()

    def info_field(label, pattern):
        # Pull one "Label: value" line out of the info box text.
        match = re.search(rf'{label}: {pattern}\s*$', stick_info, flags=re.M)
        if match is None:
            raise ValueError(f'Could not find "{label}" on {url}.')
        return match.group(1)

    author = soup.find('a', href=re.compile(r'search\.php\?searchq=')).get_text()
    vote_score = int(info_field('Vote Score', r'([-\d]+)'))
    downloads = int(info_field('Downloads', r'(\d+)'))
    category = info_field('Category', r'(.+?)')
    version = info_field('Version', r'(.+?)')
    usage_rating = info_field('Usage Rating', r'(.+?)')

    # The page only gives a calendar date. Pin it to UTC so the stored
    # timestamp does not depend on the timezone of the machine doing the
    # scraping, matching get_now which is also UTC based.
    date = datetime.datetime.strptime(info_field('Date Submitted', r'(.+?)'), '%B %d, %Y')
    date = date.replace(tzinfo=datetime.timezone.utc).timestamp()

    name = soup.select_one('.section .top h2').get_text().strip()
    description = soup.select_one('.section .content').get_text().strip()
    if description == f'{author}, has left no comments for this submission.':
        description = None
    else:
        # Only strip the site's lead-in once, so the same words appearing
        # later inside the author's own text are left alone.
        description = description.replace(f'{author} says, ', '', 1)
    download_link = soup.find('a', href=re.compile(r'/resources/grab\.php\?file='))['href']
    retrieved = int(get_now())

    data = {
        'id': id,
        'name': name,
        'description': description,
        'date': date,
        'author': author,
        'download_link': download_link,
        'category': category,
        'downloads': downloads,
        'version': version,
        'vote_score': vote_score,
        'usage_rating': usage_rating,
        'retrieved': retrieved,
    }

    # Guarantee the row exists so that the UPDATE below has something to hit.
    insert_id(id)

    cur = sql.cursor()
    (qmarks, bindings) = sqlhelpers.update_filler(data, 'id')
    query = f'UPDATE sticks {qmarks}'
    cur.execute(query, bindings)
    sql.commit()

    return data
|
||||
|
||||
def scrape_category(category):
    '''
    Walk the paginated listing of one category and register every stick id
    found there via insert_id.

    Pages are requested in increasing order until a page contributes no links
    that have not been seen already. Ids are committed after each page.

    Returns the set of matching <a> elements collected across all pages (the
    parsed elements themselves, not the id strings).
    '''
    direct_pattern = re.compile(r'/direct/\d+')
    all_directs = set()
    page = 1
    while True:
        url = f'http://droidz.org/stickmain/{category}.php?page={page}'
        print(url)
        soup = bs4.BeautifulSoup(request(url).text, 'html.parser')
        this_directs = soup.find_all('a', href=direct_pattern)

        # A page that adds nothing new means we have run past the end.
        count_before = len(all_directs)
        all_directs.update(this_directs)
        if len(all_directs) == count_before:
            break

        page += 1
        for direct in this_directs:
            insert_id(id_from_direct_url(direct['href']))

        sql.commit()

    print(f'Got {len(all_directs)} directs.')
    return all_directs
|
||||
|
||||
def scrape_latest():
    '''
    Read the "Latest 50 Accepted" box on the stickmain index page and register
    its stick ids via insert_id, stopping at the first id that is already in
    the database.

    Returns the status namespace (attributes `id`, `is_new`) of the last id
    examined. is_new=True therefore means the whole box was consumed without
    reaching a known stick, so there may be more new sticks than the box
    shows. If the box contains no stick links at all, a placeholder status
    with id=None and is_new=True is returned so the caller falls back to the
    category listings.

    Raises ValueError if the "Latest 50 Accepted" heading is not on the page.
    '''
    url = 'http://droidz.org/stickmain/'
    print(url)
    response = request(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    for h2 in soup.find_all('h2'):
        if 'Latest 50 Accepted' in h2.get_text():
            latest_50_h2 = h2
            break
    else:
        # Previously this fell through and died with a NameError below.
        raise ValueError(f'Could not find the "Latest 50 Accepted" box on {url}.')

    div = latest_50_h2.parent
    directs = div.find_all('a', href=re.compile(r'/direct/\d+'))

    # Placeholder for an empty box; previously `status` was unbound here and
    # the return statement raised UnboundLocalError.
    status = types.SimpleNamespace(id=None, is_new=True)
    for direct in directs:
        stick_id = id_from_direct_url(direct['href'])
        status = insert_id(stick_id)
        if not status.is_new:
            break

    return status
|
||||
|
||||
# UPDATE
|
||||
################################################################################
|
||||
def incremental_update():
    '''
    Scrape only the sticks that the database does not have details for yet.

    New ids are discovered from the Latest box; if that box was exhausted
    without reaching an already-known stick, every category listing is
    crawled as well. Afterwards each row whose `retrieved` column is still
    NULL is scraped with scrape_direct.
    '''
    latest_status = scrape_latest()
    if latest_status.is_new:
        print('The Latest box didn\'t contain everything.')
        print('Need to check the categories for new sticks.')
        for category in CATEGORIES:
            scrape_category(category)

    cur = sql.cursor()
    cur.execute('SELECT id FROM sticks WHERE retrieved IS NULL')
    # fetchall first: scrape_direct writes to the same table while we iterate.
    for (stick_id,) in cur.fetchall():
        scrape_direct(stick_id)
|
||||
|
||||
def full_update():
    '''
    Crawl every category listing, then re-scrape every stick in the database.

    Sticks are visited in ascending order of their `retrieved` value, so rows
    that have never been scraped (NULL sorts first in SQLite) and the stalest
    rows are refreshed before the recently scraped ones.
    '''
    for category in CATEGORIES:
        scrape_category(category)

    cur = sql.cursor()
    cur.execute('SELECT id FROM sticks ORDER BY retrieved ASC')
    # fetchall first: scrape_direct writes to the same table while we iterate.
    for (stick_id,) in cur.fetchall():
        scrape_direct(stick_id)
|
||||
|
||||
# DOWNLOAD
|
||||
################################################################################
|
||||
def download_stick(id, overwrite=False, extract=False):
    '''
    Download the file of a single stick into download/{id}/.

    id:
        The stick id. It must already be in the database with a download
        link, i.e. it must have been scraped by scrape_direct.

    overwrite:
        If False and download/{id} already exists, nothing is downloaded.

    extract:
        If True and the downloaded file has a zip extension, it is extracted
        in place with winrar and the archive is deleted once extraction
        succeeds.

    Returns the pathclass.Path of the stick's download directory.

    Raises ValueError if the id is not in the database or has no download
    link yet.
    '''
    directory = pathclass.Path('download').with_child(id)
    if directory.exists and not overwrite:
        return directory

    cur = sql.cursor()
    cur.execute('SELECT download_link FROM sticks WHERE id == ?', [id])
    row = cur.fetchone()
    if row is None:
        raise ValueError(f'Stick {id} is not in the database.')
    download_link = row[0]
    if not download_link:
        # Rows created by insert_id have only the id until scrape_direct
        # fills in the rest.
        raise ValueError(f'Stick {id} has no download link yet. Run an update first.')

    # NOTE(review): the link is used exactly as stored; presumably the site
    # serves an absolute url here -- confirm, since request() needs one.
    filename = re.search(r'file=(.+)', download_link).group(1)
    filepath = directory.with_child(filename)

    DOWNLOAD_RATELIMITER.limit()
    print(f'Downloading {id}')
    response = request(download_link)

    os.makedirs(directory.absolute_path, exist_ok=True)
    with open(filepath.absolute_path, 'wb') as handle:
        handle.write(response.content)

    # NOTE(review): assumes pathclass extensions compare equal to the bare
    # string 'zip' -- confirm against voussoirkit.pathclass.
    if extract and filepath.extension == 'zip':
        # As much as I would like to use Python's zipfile module, I found that
        # some of the .zips on the site are actually rars.
        command = [
            WINRAR, 'x',
            '-o+', '-ibck',
            filepath.absolute_path,
            '*.*',
            directory.absolute_path + os.sep,
        ]
        completed = subprocess.run(command)
        if completed.returncode == 0:
            os.remove(filepath.absolute_path)
        else:
            # Keep the archive so a failed extraction doesn't lose the download.
            print(
                f'Extracting {filepath.absolute_path} failed with exit code '
                f'{completed.returncode}. Keeping the archive.'
            )

    return directory
|
||||
|
||||
def download_all(overwrite=False, extract=False):
    '''
    Download every stick in the database that has a download link.

    Rows that were registered by insert_id but have not been scraped yet have
    no download link and are skipped here, instead of crashing the whole run
    on the first one.

    overwrite, extract:
        Passed through to download_stick for every stick.
    '''
    cur = sql.cursor()
    cur.execute('SELECT id FROM sticks WHERE download_link IS NOT NULL')
    ids = [row[0] for row in cur.fetchall()]
    for id in ids:
        download_stick(id, overwrite=overwrite, extract=extract)
|
||||
|
||||
# COMMAND LINE
|
||||
################################################################################
|
||||
from voussoirkit import betterhelp
|
||||
|
||||
DOCSTRING = '''
|
||||
Scrape sticks from droidz.org.
|
||||
|
||||
{update}
|
||||
|
||||
{download}
|
||||
|
||||
TO SEE DETAILS ON EACH COMMAND, RUN
|
||||
> droidz.py <command> --help
|
||||
'''.lstrip()
|
||||
|
||||
SUB_DOCSTRINGS = dict(
|
||||
update='''
|
||||
update:
|
||||
Update the database with stick info.
|
||||
|
||||
> droidz.py update
|
||||
|
||||
flags:
|
||||
--full:
|
||||
Re-scrape all categories and all sticks to get fresh info.
|
||||
Otherwise, only new sticks will be scraped.
|
||||
'''.strip(),
|
||||
|
||||
download='''
|
||||
download:
|
||||
Download the stick files.
|
||||
|
||||
> droidz.py download all
|
||||
> droidz.py download [ids]
|
||||
|
||||
flags:
|
||||
--overwrite:
|
||||
Re-download any existing files. Otherwise they'll be skipped.
|
||||
|
||||
--extract:
|
||||
Extract downloaded zip files.
|
||||
NOTE: Some files on the site are labeled as .zip but are actually rars,
|
||||
so this extraction process requires you to have winrar on your PATH.
|
||||
Sorry.
|
||||
'''.strip(),
|
||||
)
|
||||
|
||||
DOCSTRING = betterhelp.add_previews(DOCSTRING, SUB_DOCSTRINGS)
|
||||
|
||||
def update_argparse(args):
    '''
    Handler for the `update` subcommand: runs full_update when --full was
    given, otherwise incremental_update, and returns its result.
    '''
    updater = full_update if args.full else incremental_update
    return updater()
|
||||
|
||||
def download_argparse(args):
    '''
    Handler for the `download` subcommand.

    `download all` downloads every stick; otherwise each positional argument
    is treated as a stick id and downloaded in turn.

    Returns 0. The return value is passed to SystemExit by the entry point, so
    it must be an exit status rather than the download directory (a non-int
    object would be printed and turned into exit status 1).

    Raises Exception if --extract was requested but winrar was not found.
    '''
    if args.extract and not WINRAR:
        raise Exception('The --extract flag requires you to have winrar on your path.')

    if len(args.ids) == 1 and args.ids[0] == 'all':
        download_all(overwrite=args.overwrite, extract=args.extract)
    else:
        # Do not return from inside the loop: that stopped after the first id.
        for id in args.ids:
            download_stick(id, overwrite=args.overwrite, extract=args.extract)

    return 0
|
||||
|
||||
# Command line interface: one subparser per command. Each subparser stores
# its handler in `func`, which main() calls with the parsed arguments.
# NOTE(review): no module docstring is visible in this file, so __doc__ is
# presumably None here; the help text appears to come from betterhelp and
# DOCSTRING instead -- confirm.
parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers()

# update [--full]
p_update = subparsers.add_parser('update')
p_update.add_argument('--full', dest='full', action='store_true')
p_update.set_defaults(func=update_argparse)

# download (all | id [id ...]) [--overwrite] [--extract]
p_download = subparsers.add_parser('download')
p_download.add_argument('ids', nargs='+', default=None)
p_download.add_argument('--overwrite', dest='overwrite', action='store_true')
p_download.add_argument('--extract', dest='extract', action='store_true')
p_download.set_defaults(func=download_argparse)
|
||||
|
||||
@betterhelp.subparser_betterhelp(parser, main_docstring=DOCSTRING, sub_docstrings=SUB_DOCSTRINGS)
def main(argv):
    '''
    Parse the given argument list and dispatch to the handler that the chosen
    subcommand registered as `func`. Returns whatever the handler returns.
    '''
    parsed = parser.parse_args(argv)
    handler = parsed.func
    return handler(parsed)
|
||||
|
||||
# Script entry point: main's return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))
|
Loading…
Reference in a new issue