From 41a774fe8c36838b57b5f8ca40f7f5dc298b49d0 Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Mon, 8 Nov 2021 11:43:13 -0800 Subject: [PATCH] Add html_render command. --- hnarchive.py | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) diff --git a/hnarchive.py b/hnarchive.py index 5f3dac8..9b53793 100644 --- a/hnarchive.py +++ b/hnarchive.py @@ -1,4 +1,7 @@ import argparse +import bs4 +import datetime +import html import logging import requests import sqlite3 @@ -13,6 +16,7 @@ from voussoirkit import operatornotify from voussoirkit import ratelimiter from voussoirkit import sqlhelpers from voussoirkit import threadpool +from voussoirkit import treeclass from voussoirkit import vlogging log = vlogging.getLogger(__name__, 'hnarchive') @@ -280,6 +284,205 @@ def select_latest_id(): return None return row[0] +# RENDERING ######################################################################################## + +def _fix_ptags(text): + ''' + The text returned by HN only puts

in between paragraphs, they do + not add closing tags or put an opening

on the first paragraph. + + If the user typed a literal

then it will have been stored with < and + > so it won't get messed up here. + ''' + text = text.replace('

', '

') + text = '

' + text + '

' + return text + +def build_item_tree(*, id=None, item=None): + if id is not None and item is None: + item = select_item(id) + if item is None: + raise ValueError('We dont have that item in the database.') + elif item is not None and id is None: + id = item['id'] + else: + raise TypeError('Please pass only one of id, item.') + + tree = treeclass.Tree(str(id), data=item) + for child in select_child_items(id): + tree.add_child(build_item_tree(item=child)) + return tree + +def html_render_comment(*, soup, item): + div = soup.new_tag('div') + div['class'] = item['type'] + div['id'] = item['id'] + + userinfo = soup.new_tag('p') + div.append(userinfo) + + author = item['author'] or '[deleted]' + username = soup.new_tag('a', href=f'https://news.ycombinator.com/user?id={author}') + username.append(author) + userinfo.append(username) + + userinfo.append(' | ') + + date = datetime.datetime.utcfromtimestamp(item['time']) + date = date.strftime('%Y %b %d %H:%M:%S') + timestamp = soup.new_tag('a', href=f'https://news.ycombinator.com/item?id={item["id"]}') + timestamp.append(date) + userinfo.append(timestamp) + + text = item['text'] or '[deleted]' + text = bs4.BeautifulSoup(_fix_ptags(text), 'html.parser') + div.append(text) + return div + +def html_render_comment_tree(*, soup, tree): + div = html_render_comment(soup=soup, item=tree.data) + + for child in tree.list_children(sort=lambda node: node.data['time']): + div.append(html_render_comment_tree(soup=soup, tree=child)) + + return div + +def html_render_job(*, soup, item): + div = soup.new_tag('div') + div['class'] = item['type'] + div['id'] = item['id'] + + h = soup.new_tag('h1') + div.append(h) + h.append(item['title']) + + if item['text']: + text = bs4.BeautifulSoup(_fix_ptags(item['text']), 'html.parser') + div.append(text) + + return div + +def html_render_poll(*, soup, item): + options = select_poll_options(item['id']) + div = html_render_story(soup=soup, item=item) + for option in options: + div.append(html_render_pollopt(soup=soup, item=option)) + return div + +def html_render_pollopt(*, soup, item): + div = soup.new_tag('div') + div['class'] = item['type'] + + text = bs4.BeautifulSoup(_fix_ptags(item['text']), 'html.parser') + div.append(text) + + points = soup.new_tag('p') + points.append(f'{item["score"]} points') + div.append(points) + + return div + +def html_render_story(*, soup, item): + div = soup.new_tag('div') + div['class'] = item['type'] + div['id'] = item['id'] + + h = soup.new_tag('h1') + div.append(h) + if item['url']: + a = soup.new_tag('a', href=item['url']) + a.append(item['title']) + h.append(a) + else: + h.append(item['title']) + if item['text']: + text = bs4.BeautifulSoup(_fix_ptags(item['text']), 'html.parser') + div.append(text) + + userinfo = soup.new_tag('p') + div.append(userinfo) + + author = item['author'] + username = soup.new_tag('a', href=f'https://news.ycombinator.com/user?id={author}') + username.append(author) + userinfo.append(username) + + userinfo.append(' | ') + + date = datetime.datetime.utcfromtimestamp(item['time']) + date = date.strftime('%Y %b %d %H:%M:%S') + timestamp = soup.new_tag('a', href=f'https://news.ycombinator.com/item?id={item["id"]}') + timestamp.append(date) + userinfo.append(timestamp) + + userinfo.append(' | ') + + points = soup.new_tag('span') + points.append(f'{item["score"]} points') + userinfo.append(points) + return div + +def html_render_page(tree): + soup = bs4.BeautifulSoup() + html = soup.new_tag('html') + soup.append(html) + + head = soup.new_tag('head') + html.append(head) + + style = soup.new_tag('style') + style.append(''' + .comment, + .job, + .poll, + .pollopt, + .story + { + padding-left: 20px; + margin-top: 4px; + margin-right: 4px; + margin-bottom: 4px; + } + .job, .poll, .story + { + border: 2px solid blue; + } + body > .story + .comment, + body > .comment + .comment + { + margin-top: 10px; + } + .comment, .pollopt + { + border: 1px solid black; + } + ''') + head.append(style) + + body = soup.new_tag('body') + html.append(body) + + item = tree.data + + if item['type'] == 'comment': + body.append(html_render_comment_tree(soup=soup, tree=tree)) + + elif item['type'] == 'job': + body.append(html_render_job(soup=soup, item=item)) + + elif item['type'] == 'poll': + body.append(html_render_poll(soup=soup, item=item)) + for child in tree.list_children(sort=lambda node: node.data['time']): + body.append(html_render_comment_tree(soup=soup, tree=child)) + + elif item['type'] == 'story': + body.append(html_render_story(soup=soup, item=item)) + for child in tree.list_children(sort=lambda node: node.data['time']): + body.append(html_render_comment_tree(soup=soup, tree=child)) + + return soup + + # COMMAND LINE ##################################################################################### DOCSTRING = ''' @@ -288,6 +491,8 @@ hnarchive.py {get} +{html_render} + {livestream} {update} @@ -303,6 +508,8 @@ get=''' get: Get items between two IDs, inclusive. + > hnarchive get + flags: --lower id: Lower bound item ID. If omitted, starts from 1. @@ -321,11 +528,30 @@ livestream=''' livestream: Watch for new items in an infinite loop. + > hnarchive livestream + flags: --commit_period X: Commit the database after every X insertions. Default = 200. '''.strip(), +html_render=''' +html_render: + Render items to HTML -- stories, comment trees, etc. + + > hnarchive html_render id [id id...] + + In general, you probably want to start with the story's ID so you get the + whole page, but you can export an individual comment tree by passing the + root comment's ID. Polls and job ads should also render correctly. + + flags: + --output X: + Save the html to the file named X. Your filename may include "{id}" and + the item's ID will be formatted into the string. This will be necessary + if you are rendering multiple IDs in a single invocation. +'''.strip(), + update=''' update: Get new items, from the highest ID in the database to the present. @@ -379,6 +605,18 @@ def get_argparse(args): insert_items(items, commit_period=args.commit_period) return 0 +def html_render_argparse(args): + for id in args.ids: + tree = build_item_tree(id=id) + soup = html_render_page(tree) + html = str(soup) + if args.output: + filename = args.output.format(id=id) + with open(filename, 'w', encoding='utf-8') as handle: + handle.write(html) + else: + print(html) + @ctrlc_commit def livestream_argparse(args): NOTIFY_EVERY_LINE.set(True) @@ -438,6 +676,11 @@ def main(argv): p_get.add_argument('--commit_period', '--commit-period', type=int, default=200) p_get.set_defaults(func=get_argparse) + p_html_render = subparsers.add_parser('html_render', aliases=['html-render']) + p_html_render.add_argument('ids', nargs='+') + p_html_render.add_argument('--output', default=None) + p_html_render.set_defaults(func=html_render_argparse) + p_livestream = subparsers.add_parser('livestream') p_livestream.add_argument('--commit_period', '--commit-period', type=int, default=200) p_livestream.set_defaults(func=livestream_argparse)