Initial commit.

Ethan Dalool 2020-02-07 15:22:24 -08:00
commit 2ee9d072fe
2 changed files with 356 additions and 0 deletions

.gitignore vendored Normal file (2 lines added)

@@ -0,0 +1,2 @@
download/
*.db

droidz.py Normal file (354 lines added)

@@ -0,0 +1,354 @@
import argparse
import bs4
import datetime
import os
import re
import requests
import sqlite3
import subprocess
import sys
import types
from voussoirkit import pathclass
from voussoirkit import ratelimiter
from voussoirkit import sqlhelpers
from voussoirkit import winwhich
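# Category slugs as they appear in the listing URLs
# (http://droidz.org/stickmain/<category>.php).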
CATEGORIES = [
'stickmen',
'stickpacks',
'vehicles',
'weapons',
'objects',
'random',
'effects',
'backgrounds',
]
DB_INIT = '''
BEGIN;
CREATE TABLE IF NOT EXISTS sticks(
id TEXT PRIMARY KEY NOT NULL,
name TEXT,
description TEXT,
date INT,
author TEXT,
    download_link TEXT,
category TEXT,
downloads INT,
version TEXT,
vote_score INT,
usage_rating TEXT,
retrieved INT
);
CREATE INDEX IF NOT EXISTS index_sticks_id ON sticks(id);
COMMIT;
'''
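# Maps each table name to its ordered list of columns, parsed out of DB_INIT.
# Used below to build INSERT statements via sqlhelpers.insert_filler.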
SQL_COLUMNS = sqlhelpers.extract_table_column_map(DB_INIT)
sql = sqlite3.connect('sticks.db')
sql.executescript(DB_INIT)
USERAGENT = '''Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/79.0.3945.130 Safari/537.36'''.replace('\n', ' ').strip()
HEADERS = {
'User-Agent': USERAGENT
}
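# Be polite to droidz.org: at most one page request per second, and at most
# one file download every five seconds.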
REQUEST_RATELIMITER = ratelimiter.Ratelimiter(allowance=1, period=1)
DOWNLOAD_RATELIMITER = ratelimiter.Ratelimiter(allowance=1, period=5)
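# Locate the winrar executable on the PATH (Windows-aware which). This is None
# when winrar isn't installed, which only matters for --extract.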
WINRAR = winwhich.which('winrar')
def get_now():
return datetime.datetime.now(datetime.timezone.utc).timestamp()
def request(url):
REQUEST_RATELIMITER.limit()
response = requests.get(url, headers=HEADERS)
response.raise_for_status()
return response
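# Extract the stick id from a /direct/ URL, trimming any trailing path or
# query string. E.g. 'http://droidz.org/direct/1234' -> '1234'.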
def id_from_direct_url(direct_url):
id = direct_url.split('/direct/')[-1]
id = id.split('/')[0].split('?')[0]
return id
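# Insert a placeholder row containing only the id, if we haven't seen it
# before. The remaining columns are filled in later by scrape_direct. Returns
# a namespace whose is_new attribute tells the caller whether the id was new.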
def insert_id(id):
cur = sql.cursor()
cur.execute('SELECT 1 FROM sticks WHERE id == ?', [id])
existing = cur.fetchone()
if not existing:
data = {'id': id}
columns = SQL_COLUMNS['sticks']
(qmarks, bindings) = sqlhelpers.insert_filler(columns, data, require_all=False)
query = f'INSERT INTO sticks VALUES({qmarks})'
cur.execute(query, bindings)
status = types.SimpleNamespace(id=id, is_new=not existing)
return status
# SCRAPE
################################################################################
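# Scrape the detail page for a single stick and store all of its metadata in
# the database.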
def scrape_direct(id):
url = f'http://droidz.org/direct/{id}'
print(url)
response = request(url)
text = response.text
    # I had a weird issue where some brs were not self-closing and ended up
    # containing a bunch of other elements. Normalizing them all to
    # self-closing <br/> tags fixed the issue, though I didn't quite
    # understand why.
text = re.sub(r'<\s*br\s*/\s*>', '<br/>', text)
soup = bs4.BeautifulSoup(text, 'html.parser')
for br in soup.find_all('br'):
br.replace_with('\n')
stick_info = soup.select('.content')[1].get_text()
author = soup.find('a', href=re.compile(r'search\.php\?searchq=')).get_text()
vote_score = int(re.search(r'Vote Score: ([-\d]+)\s*$', stick_info, flags=re.M).group(1))
downloads = int(re.search(r'Downloads: (\d+)\s*$', stick_info, flags=re.M).group(1))
category = re.search(r'Category: (.+?)\s*$', stick_info, flags=re.M).group(1)
version = re.search(r'Version: (.+?)\s*$', stick_info, flags=re.M).group(1)
usage_rating = re.search(r'Usage Rating: (.+?)\s*$', stick_info, flags=re.M).group(1)
date = re.search(r'Date Submitted: (.+?)\s*$', stick_info, flags=re.M).group(1)
date = datetime.datetime.strptime(date, '%B %d, %Y')
date = date.timestamp()
name = soup.select_one('.section .top h2').get_text().strip()
description = soup.select_one('.section .content').get_text().strip()
if description == f'{author}, has left no comments for this submission.':
description = None
else:
description = description.replace(f'{author} says, ', '')
download_link = soup.find('a', href=re.compile(r'/resources/grab\.php\?file='))['href']
retrieved = int(get_now())
data = {
'id': id,
'name': name,
'description': description,
'date': date,
'author': author,
'download_link': download_link,
'category': category,
'downloads': downloads,
'version': version,
'vote_score': vote_score,
'usage_rating': usage_rating,
'retrieved': retrieved,
}
insert_id(id)
cur = sql.cursor()
(qmarks, bindings) = sqlhelpers.update_filler(data, 'id')
query = f'UPDATE sticks {qmarks}'
cur.execute(query, bindings)
sql.commit()
return data
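# Walk a category's listing pages, collecting /direct/ links and inserting
# each id as a placeholder row. Stops when a page contributes no links we
# haven't already seen.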
def scrape_category(category):
page = 1
all_directs = set()
while True:
url = f'http://droidz.org/stickmain/{category}.php?page={page}'
print(url)
response = request(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
this_directs = soup.find_all('a', href=re.compile(r'/direct/\d+'))
prev_count = len(all_directs)
all_directs.update(this_directs)
if len(all_directs) == prev_count:
break
page += 1
for direct in this_directs:
id = id_from_direct_url(direct['href'])
insert_id(id)
sql.commit()
print(f'Got {len(all_directs)} directs.')
return all_directs
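# Scrape the "Latest 50 Accepted" box on the front page, stopping at the first
# id we already know about. Returns the status of the last insert so the
# caller can tell whether the whole box was new.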
def scrape_latest():
url = 'http://droidz.org/stickmain/'
print(url)
response = request(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
h2s = soup.find_all('h2')
for h2 in h2s:
if 'Latest 50 Accepted' in h2.get_text():
latest_50_h2 = h2
break
div = latest_50_h2.parent
directs = div.find_all('a', href=re.compile(r'/direct/\d+'))
for direct in directs:
id = id_from_direct_url(direct['href'])
status = insert_id(id)
if not status.is_new:
break
return status
# UPDATE
################################################################################
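# Check the Latest box first. If every entry in it was new, the box may not
# cover everything we missed, so re-scrape all the categories. Then scrape
# detail pages for any ids that have never been retrieved.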
def incremental_update():
status = scrape_latest()
if status.is_new:
print('The Latest box didn\'t contain everything.')
print('Need to check the categories for new sticks.')
for category in CATEGORIES:
scrape_category(category)
cur = sql.cursor()
cur.execute('SELECT id FROM sticks WHERE retrieved IS NULL')
ids = [row[0] for row in cur.fetchall()]
for id in ids:
scrape_direct(id)
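# Re-scrape every category and every stick, oldest retrieval first, to refresh
# all of the stored info.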
def full_update():
for category in CATEGORIES:
scrape_category(category)
cur = sql.cursor()
cur.execute('SELECT id FROM sticks ORDER BY retrieved ASC')
ids = [row[0] for row in cur.fetchall()]
for id in ids:
scrape_direct(id)
# DOWNLOAD
################################################################################
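# Download one stick's file into download/<id>/. Skips sticks whose directory
# already exists unless overwrite is given. With extract, zips are unpacked
# with winrar and the archive is deleted afterwards.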
def download_stick(id, overwrite=False, extract=False):
directory = pathclass.Path('download').with_child(id)
if directory.exists and not overwrite:
return directory
cur = sql.cursor()
cur.execute('SELECT download_link FROM sticks WHERE id == ?', [id])
download_link = cur.fetchone()[0]
filename = re.search(r'file=(.+)', download_link).group(1)
filepath = directory.with_child(filename)
DOWNLOAD_RATELIMITER.limit()
print(f'Downloading {id}')
response = request(download_link)
os.makedirs(directory.absolute_path, exist_ok=True)
with open(filepath.absolute_path, 'wb') as handle:
handle.write(response.content)
if extract and filepath.extension == 'zip':
# As much as I would like to use Python's zipfile module, I found that
# some of the .zips on the site are actually rars.
command = [
WINRAR, 'x',
'-o+', '-ibck',
filepath.absolute_path,
'*.*',
directory.absolute_path + os.sep,
]
subprocess.run(command)
os.remove(filepath.absolute_path)
return directory
def download_all(overwrite=False, extract=False):
cur = sql.cursor()
cur.execute('SELECT id FROM sticks')
ids = [row[0] for row in cur.fetchall()]
for id in ids:
download_stick(id, overwrite=overwrite, extract=extract)
# COMMAND LINE
################################################################################
from voussoirkit import betterhelp
DOCSTRING = '''
Scrape sticks from droidz.org.
{update}
{download}
TO SEE DETAILS ON EACH COMMAND, RUN
> droidz.py <command> --help
'''.lstrip()
SUB_DOCSTRINGS = dict(
update='''
update:
Update the database with stick info.
> droidz.py update
flags:
--full:
Re-scrape all categories and all sticks to get fresh info.
Otherwise, only new sticks will be scraped.
'''.strip(),
download='''
download:
Download the stick files.
> droidz.py download all
> droidz.py download [ids]
flags:
--overwrite:
Re-download any existing files. Otherwise they'll be skipped.
--extract:
Extract downloaded zip files.
NOTE: Some files on the site are labeled as .zip but are actually rars,
so this extraction process requires you to have winrar on your PATH.
Sorry.
'''.strip(),
)
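# add_previews fills the {update} and {download} placeholders in DOCSTRING
# with previews built from SUB_DOCSTRINGS.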
DOCSTRING = betterhelp.add_previews(DOCSTRING, SUB_DOCSTRINGS)
def update_argparse(args):
if args.full:
return full_update()
else:
return incremental_update()
def download_argparse(args):
if args.extract and not WINRAR:
raise Exception('The --extract flag requires you to have winrar on your path.')
if len(args.ids) == 1 and args.ids[0] == 'all':
return download_all(overwrite=args.overwrite, extract=args.extract)
else:
        for id in args.ids:
            download_stick(id, overwrite=args.overwrite, extract=args.extract)
parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers()
p_update = subparsers.add_parser('update')
p_update.add_argument('--full', dest='full', action='store_true')
p_update.set_defaults(func=update_argparse)
p_download = subparsers.add_parser('download')
p_download.add_argument('ids', nargs='+', default=None)
p_download.add_argument('--overwrite', dest='overwrite', action='store_true')
p_download.add_argument('--extract', dest='extract', action='store_true')
p_download.set_defaults(func=download_argparse)
@betterhelp.subparser_betterhelp(parser, main_docstring=DOCSTRING, sub_docstrings=SUB_DOCSTRINGS)
def main(argv):
args = parser.parse_args(argv)
return args.func(args)
if __name__ == '__main__':
raise SystemExit(main(sys.argv[1:]))