Add html_render command.

2021-11-08 11:43:13 -08:00 · 2021-11-08 11:43:13 -08:00 · 41a774fe8c
parent 884a3a7a6c
commit 41a774fe8c
1 changed files with 243 additions and 0 deletions
--- a/hnarchive.py
+++ b/hnarchive.py
@ -1,4 +1,7 @@
 import argparse
+import bs4
+import datetime
+import html
 import logging
 import requests
 import sqlite3
@ -13,6 +16,7 @@ from voussoirkit import operatornotify
 from voussoirkit import ratelimiter
 from voussoirkit import sqlhelpers
 from voussoirkit import threadpool
+from voussoirkit import treeclass
 from voussoirkit import vlogging

 log = vlogging.getLogger(__name__, 'hnarchive')
@ -280,6 +284,205 @@ def select_latest_id():
        return None
    return row[0]

+# RENDERING ########################################################################################
+
+def _fix_ptags(text):
+    '''
+    Turn HN's separator-style <p> markup into properly paired <p>...</p>.
+
+    HN places a bare <p> between paragraphs only: the first paragraph has no
+    opening tag and nothing is ever closed. Every separator becomes
+    '</p><p>' and the whole text is enclosed in one outer pair.
+
+    A <p> typed literally by the user is stored as &lt;p&gt;, so it is left
+    alone here.
+    '''
+    paragraphs = text.split('<p>')
+    return '<p>' + '</p><p>'.join(paragraphs) + '</p>'
+
+def build_item_tree(*, id=None, item=None):
+    '''
+    Build a treeclass.Tree rooted at one item, with its descendants attached
+    recursively.
+
+    Exactly one of the keyword arguments must be given:
+
+    id: The item is looked up with select_item.
+    item: An item that is already in hand; its 'id' key is used to look up
+        the children.
+
+    Each node is named str(id) and carries the item as its data.
+
+    Raises TypeError if both or neither of id, item are passed, and
+    ValueError if select_item returns None for the id.
+    '''
+    if (id is None) == (item is None):
+        raise TypeError('Please pass only one of id, item.')
+
+    if item is None:
+        item = select_item(id)
+        if item is None:
+            raise ValueError('We dont have that item in the database.')
+    else:
+        id = item['id']
+
+    root = treeclass.Tree(str(id), data=item)
+    for child_item in select_child_items(id):
+        subtree = build_item_tree(item=child_item)
+        root.add_child(subtree)
+    return root
+
+def html_render_comment(*, soup, item):
+    '''
+    Render a single comment, without its replies, as a <div>.
+
+    soup: The BeautifulSoup document used to create the new tags.
+    item: The comment. The keys type, id, author, time, and text are read.
+
+    The div gets the item's type as its class and the item's id as its id.
+    It holds a line with the author and the UTC timestamp, each linking to
+    news.ycombinator.com, followed by the comment text. A missing author or
+    text is shown as [deleted].
+
+    Returns the new div. This function does not attach it to soup.
+    '''
+    div = soup.new_tag('div')
+    div['class'] = item['type']
+    div['id'] = item['id']
+
+    userinfo = soup.new_tag('p')
+    div.append(userinfo)
+
+    author = item['author'] or '[deleted]'
+    username = soup.new_tag('a', href=f'https://news.ycombinator.com/user?id={author}')
+    username.append(author)
+    userinfo.append(username)
+
+    userinfo.append(' | ')
+
+    # utcfromtimestamp is deprecated as of Python 3.12. The timezone-aware
+    # call below yields the same string for this format.
+    date = datetime.datetime.fromtimestamp(item['time'], tz=datetime.timezone.utc)
+    date = date.strftime('%Y %b %d %H:%M:%S')
+    timestamp = soup.new_tag('a', href=f'https://news.ycombinator.com/item?id={item["id"]}')
+    timestamp.append(date)
+    userinfo.append(timestamp)
+
+    text = item['text'] or '[deleted]'
+    # The text is HTML, so it is parsed into nodes instead of being appended
+    # as a string.
+    text = bs4.BeautifulSoup(_fix_ptags(text), 'html.parser')
+    div.append(text)
+    return div
+
+def html_render_comment_tree(*, soup, tree):
+    '''
+    Render a comment and all of its descendants as nested <div>s.
+
+    soup: The BeautifulSoup document used to create the new tags.
+    tree: A tree node whose data is the comment item.
+
+    Each reply's div is placed inside its parent's div. The children are
+    requested from the tree with a sort key of their data['time'].
+
+    Returns the div of the root comment.
+    '''
+    def by_time(node):
+        return node.data['time']
+
+    root_div = html_render_comment(soup=soup, item=tree.data)
+    replies = tree.list_children(sort=by_time)
+    for reply in replies:
+        reply_div = html_render_comment_tree(soup=soup, tree=reply)
+        root_div.append(reply_div)
+    return root_div
+
+def html_render_job(*, soup, item):
+    '''
+    Render a job posting as a <div>.
+
+    soup: The BeautifulSoup document used to create the new tags.
+    item: The job. The keys type, id, title, and text are read.
+
+    The div gets the item's type as its class and the item's id as its id.
+    The title goes in an <h1>. The text, if there is any, is parsed as HTML
+    and placed after the heading.
+
+    Returns the new div.
+    '''
+    container = soup.new_tag('div')
+    container['class'] = item['type']
+    container['id'] = item['id']
+
+    heading = soup.new_tag('h1')
+    heading.append(item['title'])
+    container.append(heading)
+
+    body_text = item['text']
+    if not body_text:
+        return container
+
+    parsed = bs4.BeautifulSoup(_fix_ptags(body_text), 'html.parser')
+    container.append(parsed)
+    return container
+
+def html_render_poll(*, soup, item):
+    '''
+    Render a poll as a <div>: the output of html_render_story for the poll
+    itself, with one html_render_pollopt block appended per option.
+
+    soup: The BeautifulSoup document used to create the new tags.
+    item: The poll. Its id is passed to select_poll_options.
+
+    Returns the div.
+    '''
+    pollopts = select_poll_options(item['id'])
+    poll_div = html_render_story(soup=soup, item=item)
+    for pollopt in pollopts:
+        pollopt_div = html_render_pollopt(soup=soup, item=pollopt)
+        poll_div.append(pollopt_div)
+    return poll_div
+
+def html_render_pollopt(*, soup, item):
+    '''
+    Render one poll option as a <div> holding its text and its score.
+
+    soup: The BeautifulSoup document used to create the new tags.
+    item: The poll option. The keys type, text, and score are read.
+
+    The div gets the item's type as its class. A missing text is shown
+    as [deleted].
+
+    Returns the new div.
+    '''
+    div = soup.new_tag('div')
+    div['class'] = item['type']
+
+    # _fix_ptags calls str methods, so a None text would raise
+    # AttributeError. Fall back to the placeholder that html_render_comment
+    # uses for missing text.
+    text = item['text'] or '[deleted]'
+    text = bs4.BeautifulSoup(_fix_ptags(text), 'html.parser')
+    div.append(text)
+
+    points = soup.new_tag('p')
+    points.append(f'{item["score"]} points')
+    div.append(points)
+
+    return div
+
+def html_render_story(*, soup, item):
+    '''
+    Render a story header as a <div>. html_render_poll uses this for the
+    poll's own header too.
+
+    soup: The BeautifulSoup document used to create the new tags.
+    item: The story. The keys type, id, url, title, text, author, time, and
+        score are read.
+
+    The div gets the item's type as its class and the item's id as its id.
+    It holds, in order: the title in an <h1> (linked when the item has a
+    url), the text if there is any, and a line with the author, the UTC
+    timestamp, and the score. A missing author or title is shown
+    as [deleted].
+
+    Returns the new div.
+    '''
+    div = soup.new_tag('div')
+    div['class'] = item['type']
+    div['id'] = item['id']
+
+    # Appending None to a tag raises, so missing fields get the placeholder
+    # that html_render_comment uses.
+    title = item['title'] or '[deleted]'
+    author = item['author'] or '[deleted]'
+
+    h = soup.new_tag('h1')
+    div.append(h)
+    if item['url']:
+        a = soup.new_tag('a', href=item['url'])
+        a.append(title)
+        h.append(a)
+    else:
+        h.append(title)
+    if item['text']:
+        text = bs4.BeautifulSoup(_fix_ptags(item['text']), 'html.parser')
+        div.append(text)
+
+    userinfo = soup.new_tag('p')
+    div.append(userinfo)
+
+    username = soup.new_tag('a', href=f'https://news.ycombinator.com/user?id={author}')
+    username.append(author)
+    userinfo.append(username)
+
+    userinfo.append(' | ')
+
+    # utcfromtimestamp is deprecated as of Python 3.12. The timezone-aware
+    # call below yields the same string for this format.
+    date = datetime.datetime.fromtimestamp(item['time'], tz=datetime.timezone.utc)
+    date = date.strftime('%Y %b %d %H:%M:%S')
+    timestamp = soup.new_tag('a', href=f'https://news.ycombinator.com/item?id={item["id"]}')
+    timestamp.append(date)
+    userinfo.append(timestamp)
+
+    userinfo.append(' | ')
+
+    points = soup.new_tag('span')
+    points.append(f'{item["score"]} points')
+    userinfo.append(points)
+    return div
+
+def html_render_page(tree):
+    '''
+    Render an item tree as a complete HTML document with an embedded
+    stylesheet.
+
+    tree: A tree node whose data is the root item, as produced by
+        build_item_tree.
+
+    What goes in <body> depends on the root item's type:
+
+    comment: The nested comment tree.
+    job: The job alone. Its children are not rendered.
+    poll, story: The header, followed by each child's comment tree as a
+        direct child of <body>.
+
+    Any other type leaves <body> empty.
+
+    Returns the BeautifulSoup document.
+    '''
+    # Name the parser explicitly, as the item renderers do, so that the
+    # document does not depend on which parsers happen to be installed.
+    soup = bs4.BeautifulSoup('', 'html.parser')
+    # Not named `html`, which would shadow the imported html module.
+    html_tag = soup.new_tag('html')
+    soup.append(html_tag)
+
+    head = soup.new_tag('head')
+    html_tag.append(head)
+
+    style = soup.new_tag('style')
+    style.append('''
+    .comment,
+    .job,
+    .poll,
+    .pollopt,
+    .story
+    {
+        padding-left: 20px;
+        margin-top: 4px;
+        margin-right: 4px;
+        margin-bottom: 4px;
+    }
+    .job, .poll, .story
+    {
+        border: 2px solid blue;
+    }
+    body > .story + .comment,
+    body > .comment + .comment
+    {
+        margin-top: 10px;
+    }
+    .comment, .pollopt
+    {
+        border: 1px solid black;
+    }
+    ''')
+    head.append(style)
+
+    body = soup.new_tag('body')
+    html_tag.append(body)
+
+    item = tree.data
+    item_type = item['type']
+
+    if item_type == 'comment':
+        body.append(html_render_comment_tree(soup=soup, tree=tree))
+
+    elif item_type == 'job':
+        body.append(html_render_job(soup=soup, item=item))
+
+    elif item_type in ('poll', 'story'):
+        # Polls and stories differ only in how the header is rendered. The
+        # comment trees below it are handled identically.
+        render_header = html_render_poll if item_type == 'poll' else html_render_story
+        body.append(render_header(soup=soup, item=item))
+        for child in tree.list_children(sort=lambda node: node.data['time']):
+            body.append(html_render_comment_tree(soup=soup, tree=child))
+
+    return soup
+
+
 # COMMAND LINE #####################################################################################

 DOCSTRING = '''
@ -288,6 +491,8 @@ hnarchive.py

 {get}

+{html_render}
+
 {livestream}

 {update}
@ -303,6 +508,8 @@ get='''
 get:
    Get items between two IDs, inclusive.

+    > hnarchive get <flags>
+
    flags:
    --lower id:
        Lower bound item ID. If omitted, starts from 1.
@ -321,11 +528,30 @@ livestream='''
 livestream:
    Watch for new items in an infinite loop.

+    > hnarchive livestream <flags>
+
    flags:
    --commit_period X:
        Commit the database after every X insertions. Default = 200.
 '''.strip(),

+html_render='''
+html_render:
+    Render items to HTML -- stories, comment trees, etc.
+
+    > hnarchive html_render id [id id...] <flags>
+
+    In general, you probably want to start with the story's ID so you get the
+    whole page, but you can export an individual comment tree by passing the
+    root comment's ID. Polls and job ads should also render correctly.
+
+    flags:
+    --output X:
+        Save the html to the file named X. Your filename may include "{id}" and
+        the item's ID will be formatted into the string. This will be necessary
+        if you are rendering multiple IDs in a single invocation.
+'''.strip(),
+
 update='''
 update:
    Get new items, from the highest ID in the database to the present.
@ -379,6 +605,18 @@ def get_argparse(args):
    insert_items(items, commit_period=args.commit_period)
    return 0

+def html_render_argparse(args):
+    '''
+    Command line handler for html_render.
+
+    Each ID in args.ids is built into a tree and rendered to a page. If
+    args.output is set, it is formatted with id=<the ID> and the page is
+    written to that filename as UTF-8. Otherwise the page is printed.
+
+    Returns 0.
+    '''
+    for item_id in args.ids:
+        tree = build_item_tree(id=item_id)
+        soup = html_render_page(tree)
+        # Not named `html`, which would shadow the imported html module.
+        page = str(soup)
+        if args.output:
+            filename = args.output.format(id=item_id)
+            with open(filename, 'w', encoding='utf-8') as handle:
+                handle.write(page)
+        else:
+            print(page)
+
+    # Report success with an explicit status, as get_argparse does.
+    return 0
+
@ctrlc_commit
 def livestream_argparse(args):
    NOTIFY_EVERY_LINE.set(True)
@ -438,6 +676,11 @@ def main(argv):
    p_get.add_argument('--commit_period', '--commit-period', type=int, default=200)
    p_get.set_defaults(func=get_argparse)

+    p_html_render = subparsers.add_parser('html_render', aliases=['html-render'])
+    p_html_render.add_argument('ids', nargs='+')
+    p_html_render.add_argument('--output', default=None)
+    p_html_render.set_defaults(func=html_render_argparse)
+
    p_livestream = subparsers.add_parser('livestream')
    p_livestream.add_argument('--commit_period', '--commit-period', type=int, default=200)
    p_livestream.set_defaults(func=livestream_argparse)