Add html_render command.
This commit is contained in:
parent
884a3a7a6c
commit
41a774fe8c
1 changed files with 243 additions and 0 deletions
243
hnarchive.py
243
hnarchive.py
|
@ -1,4 +1,7 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import bs4
|
||||||
|
import datetime
|
||||||
|
import html
|
||||||
import logging
|
import logging
|
||||||
import requests
|
import requests
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
@ -13,6 +16,7 @@ from voussoirkit import operatornotify
|
||||||
from voussoirkit import ratelimiter
|
from voussoirkit import ratelimiter
|
||||||
from voussoirkit import sqlhelpers
|
from voussoirkit import sqlhelpers
|
||||||
from voussoirkit import threadpool
|
from voussoirkit import threadpool
|
||||||
|
from voussoirkit import treeclass
|
||||||
from voussoirkit import vlogging
|
from voussoirkit import vlogging
|
||||||
|
|
||||||
log = vlogging.getLogger(__name__, 'hnarchive')
|
log = vlogging.getLogger(__name__, 'hnarchive')
|
||||||
|
@ -280,6 +284,205 @@ def select_latest_id():
|
||||||
return None
|
return None
|
||||||
return row[0]
|
return row[0]
|
||||||
|
|
||||||
|
# RENDERING ########################################################################################
|
||||||
|
|
||||||
|
def _fix_ptags(text):
|
||||||
|
'''
|
||||||
|
The text returned by HN only puts <p> in between paragraphs, they do
|
||||||
|
not add closing tags or put an opening <p> on the first paragraph.
|
||||||
|
|
||||||
|
If the user typed a literal <p> then it will have been stored with < and
|
||||||
|
> so it won't get messed up here.
|
||||||
|
'''
|
||||||
|
text = text.replace('<p>', '</p><p>')
|
||||||
|
text = '<p>' + text + '</p>'
|
||||||
|
return text
|
||||||
|
|
||||||
|
def build_item_tree(*, id=None, item=None):
    '''
    Recursively build a treeclass.Tree rooted at one item, with a child node
    for every item returned by select_child_items.

    Exactly one of the two keyword arguments must be given:

    id:
        The ID of the root item, which is looked up with select_item.
    item:
        An already-fetched item, whose 'id' key is used as the root ID.

    Each node is identified by str(id) and carries the item as its data.

    Raises TypeError if both or neither of id, item are passed.
    Raises ValueError if id was passed and select_item returned None.
    '''
    have_id = id is not None
    have_item = item is not None

    # Both given or both missing: the caller has to pick exactly one.
    if have_id == have_item:
        raise TypeError('Please pass only one of id, item.')

    if have_item:
        id = item['id']
    else:
        item = select_item(id)
        if item is None:
            raise ValueError('We dont have that item in the database.')

    root = treeclass.Tree(str(id), data=item)
    for child_item in select_child_items(id):
        subtree = build_item_tree(item=child_item)
        root.add_child(subtree)
    return root
|
||||||
|
|
||||||
|
def html_render_comment(*, soup, item):
    '''
    Build the <div> for a single comment: a header paragraph linking to the
    author and to the comment's own HN page, followed by the comment text.
    Child comments are not rendered here.

    soup:
        The BeautifulSoup object used to create the new tags.
    item:
        The comment. Must support lookup of 'type', 'id', 'author', 'time',
        and 'text'. 'time' is interpreted as a unix timestamp.

    Returns the new div tag. It is not attached to anything; the caller
    decides where to put it.
    '''
    div = soup.new_tag('div')
    div['class'] = item['type']
    div['id'] = item['id']

    userinfo = soup.new_tag('p')
    div.append(userinfo)

    # A comment with no author is shown with a placeholder name.
    author = item['author'] or '[deleted]'
    username = soup.new_tag('a', href=f'https://news.ycombinator.com/user?id={author}')
    username.append(author)
    userinfo.append(username)

    userinfo.append(' | ')

    # datetime.utcfromtimestamp is deprecated as of Python 3.12. An aware UTC
    # datetime produces the same string for this format, which has no
    # timezone directive.
    date = datetime.datetime.fromtimestamp(item['time'], tz=datetime.timezone.utc)
    date = date.strftime('%Y %b %d %H:%M:%S')
    timestamp = soup.new_tag('a', href=f'https://news.ycombinator.com/item?id={item["id"]}')
    timestamp.append(date)
    userinfo.append(timestamp)

    # The text is HN-flavored HTML, so it is parsed into real nodes instead
    # of being appended as an escaped string.
    text = item['text'] or '[deleted]'
    text = bs4.BeautifulSoup(_fix_ptags(text), 'html.parser')
    div.append(text)
    return div
|
||||||
|
|
||||||
|
def html_render_comment_tree(*, soup, tree):
    '''
    Render a comment and, recursively, all of the comments beneath it.

    soup:
        The BeautifulSoup object used to create the new tags.
    tree:
        A tree node whose data is the comment item and whose children are
        its replies.

    Each reply's div is nested inside its parent's div. Replies are
    requested from list_children with each node's 'time' as the sort key.

    Returns the div of the root comment.
    '''
    def by_time(node):
        return node.data['time']

    rendered = html_render_comment(soup=soup, item=tree.data)
    for reply in tree.list_children(sort=by_time):
        reply_div = html_render_comment_tree(soup=soup, tree=reply)
        rendered.append(reply_div)
    return rendered
|
||||||
|
|
||||||
|
def html_render_job(*, soup, item):
    '''
    Build the <div> for a job ad: the title as an <h1>, then the body text
    if the item has any.

    soup:
        The BeautifulSoup object used to create the new tags.
    item:
        The job. Must support lookup of 'type', 'id', 'title', and 'text'.

    Returns the new div tag, not attached to anything.
    '''
    container = soup.new_tag('div')
    container['class'] = item['type']
    container['id'] = item['id']

    heading = soup.new_tag('h1')
    heading.append(item['title'])
    container.append(heading)

    body_text = item['text']
    if body_text:
        # The text is HTML, so parse it into nodes before attaching.
        parsed = bs4.BeautifulSoup(_fix_ptags(body_text), 'html.parser')
        container.append(parsed)

    return container
|
||||||
|
|
||||||
|
def html_render_poll(*, soup, item):
    '''
    Build the <div> for a poll. The poll itself is rendered exactly like a
    story, and then each of its options is appended inside that div.

    soup:
        The BeautifulSoup object used to create the new tags.
    item:
        The poll. Its 'id' is used to look up the options with
        select_poll_options.

    Returns the div tag, not attached to anything.
    '''
    poll_options = select_poll_options(item['id'])
    container = html_render_story(soup=soup, item=item)
    for poll_option in poll_options:
        option_div = html_render_pollopt(soup=soup, item=poll_option)
        container.append(option_div)
    return container
|
||||||
|
|
||||||
|
def html_render_pollopt(*, soup, item):
    '''
    Build the <div> for a single poll option: the option text followed by
    a paragraph with its score.

    soup:
        The BeautifulSoup object used to create the new tags.
    item:
        The poll option. Must support lookup of 'type', 'text', and 'score'.

    Returns the new div tag, not attached to anything.
    '''
    div = soup.new_tag('div')
    div['class'] = item['type']

    # _fix_ptags calls str.replace on its argument, so an option with no text
    # (presumably a deleted one -- TODO confirm) would raise there. Use the
    # same placeholder that html_render_comment uses.
    text = item['text'] or '[deleted]'
    text = bs4.BeautifulSoup(_fix_ptags(text), 'html.parser')
    div.append(text)

    points = soup.new_tag('p')
    points.append(f'{item["score"]} points')
    div.append(points)

    return div
|
||||||
|
|
||||||
|
def html_render_story(*, soup, item):
    '''
    Build the <div> for a story: the title as an <h1> (linked if the story
    has a url), the body text if there is any, and a footer paragraph with
    the author, the submission date, and the score.

    soup:
        The BeautifulSoup object used to create the new tags.
    item:
        The story. Must support lookup of 'type', 'id', 'url', 'title',
        'text', 'author', 'time', and 'score'. 'time' is interpreted as a
        unix timestamp.

    Returns the new div tag, not attached to anything. Comments are not
    rendered here.
    '''
    div = soup.new_tag('div')
    div['class'] = item['type']
    div['id'] = item['id']

    h = soup.new_tag('h1')
    div.append(h)
    if item['url']:
        a = soup.new_tag('a', href=item['url'])
        a.append(item['title'])
        h.append(a)
    else:
        h.append(item['title'])

    if item['text']:
        # The text is HTML, so parse it into nodes before attaching.
        text = bs4.BeautifulSoup(_fix_ptags(item['text']), 'html.parser')
        div.append(text)

    userinfo = soup.new_tag('p')
    div.append(userinfo)

    # Same placeholder as html_render_comment, so that a story with no author
    # does not try to append None to the link.
    author = item['author'] or '[deleted]'
    username = soup.new_tag('a', href=f'https://news.ycombinator.com/user?id={author}')
    username.append(author)
    userinfo.append(username)

    userinfo.append(' | ')

    # datetime.utcfromtimestamp is deprecated as of Python 3.12. An aware UTC
    # datetime produces the same string for this format, which has no
    # timezone directive.
    date = datetime.datetime.fromtimestamp(item['time'], tz=datetime.timezone.utc)
    date = date.strftime('%Y %b %d %H:%M:%S')
    timestamp = soup.new_tag('a', href=f'https://news.ycombinator.com/item?id={item["id"]}')
    timestamp.append(date)
    userinfo.append(timestamp)

    userinfo.append(' | ')

    points = soup.new_tag('span')
    points.append(f'{item["score"]} points')
    userinfo.append(points)
    return div
|
||||||
|
|
||||||
|
def html_render_page(tree):
    '''
    Render a whole item tree as a standalone HTML document.

    tree:
        A tree node whose data is the root item. The root's 'type' decides
        the layout:
        comment: the comment and all of its replies, nested.
        job: the job ad alone.
        poll, story: the poll or story, followed by each top-level comment
        tree as a sibling in the body.
        Any other type leaves the body empty.

    Returns the BeautifulSoup document, containing an <html> element with a
    <head> holding the stylesheet and a <body> holding the rendered items.
    '''
    soup = bs4.BeautifulSoup()

    # Create the skeleton first, then attach it in document order.
    root = soup.new_tag('html')
    head = soup.new_tag('head')
    stylesheet = soup.new_tag('style')
    body = soup.new_tag('body')

    stylesheet.append('''
.comment,
.job,
.poll,
.pollopt,
.story
{
padding-left: 20px;
margin-top: 4px;
margin-right: 4px;
margin-bottom: 4px;
}
.job, .poll, .story
{
border: 2px solid blue;
}
body > .story + .comment,
body > .comment + .comment
{
margin-top: 10px;
}
.comment, .pollopt
{
border: 1px solid black;
}
''')

    soup.append(root)
    root.append(head)
    head.append(stylesheet)
    root.append(body)

    item = tree.data
    kind = item['type']

    def by_time(node):
        return node.data['time']

    if kind == 'comment':
        body.append(html_render_comment_tree(soup=soup, tree=tree))

    elif kind == 'job':
        body.append(html_render_job(soup=soup, item=item))

    elif kind in ('poll', 'story'):
        # Polls and stories share a layout: the item itself on top, and each
        # top-level comment tree after it.
        render_root = html_render_poll if kind == 'poll' else html_render_story
        body.append(render_root(soup=soup, item=item))
        for subtree in tree.list_children(sort=by_time):
            body.append(html_render_comment_tree(soup=soup, tree=subtree))

    return soup
|
||||||
|
|
||||||
|
|
||||||
# COMMAND LINE #####################################################################################
|
# COMMAND LINE #####################################################################################
|
||||||
|
|
||||||
DOCSTRING = '''
|
DOCSTRING = '''
|
||||||
|
@ -288,6 +491,8 @@ hnarchive.py
|
||||||
|
|
||||||
{get}
|
{get}
|
||||||
|
|
||||||
|
{html_render}
|
||||||
|
|
||||||
{livestream}
|
{livestream}
|
||||||
|
|
||||||
{update}
|
{update}
|
||||||
|
@ -303,6 +508,8 @@ get='''
|
||||||
get:
|
get:
|
||||||
Get items between two IDs, inclusive.
|
Get items between two IDs, inclusive.
|
||||||
|
|
||||||
|
> hnarchive get <flags>
|
||||||
|
|
||||||
flags:
|
flags:
|
||||||
--lower id:
|
--lower id:
|
||||||
Lower bound item ID. If omitted, starts from 1.
|
Lower bound item ID. If omitted, starts from 1.
|
||||||
|
@ -321,11 +528,30 @@ livestream='''
|
||||||
livestream:
|
livestream:
|
||||||
Watch for new items in an infinite loop.
|
Watch for new items in an infinite loop.
|
||||||
|
|
||||||
|
> hnarchive livestream <flags>
|
||||||
|
|
||||||
flags:
|
flags:
|
||||||
--commit_period X:
|
--commit_period X:
|
||||||
Commit the database after every X insertions. Default = 200.
|
Commit the database after every X insertions. Default = 200.
|
||||||
'''.strip(),
|
'''.strip(),
|
||||||
|
|
||||||
|
html_render='''
|
||||||
|
html_render:
|
||||||
|
Render items to HTML -- stories, comment trees, etc.
|
||||||
|
|
||||||
|
> hnarchive html_render id [id id...] <flags>
|
||||||
|
|
||||||
|
In general, you probably want to start with the story's ID so you get the
|
||||||
|
whole page, but you can export an individual comment tree by passing the
|
||||||
|
root comment's ID. Polls and job ads should also render correctly.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
--output X:
|
||||||
|
Save the html to the file named X. Your filename may include "{id}" and
|
||||||
|
the item's ID will be formatted into the string. This will be necessary
|
||||||
|
if you are rendering multiple IDs in a single invocation.
|
||||||
|
'''.strip(),
|
||||||
|
|
||||||
update='''
|
update='''
|
||||||
update:
|
update:
|
||||||
Get new items, from the highest ID in the database to the present.
|
Get new items, from the highest ID in the database to the present.
|
||||||
|
@ -379,6 +605,18 @@ def get_argparse(args):
|
||||||
insert_items(items, commit_period=args.commit_period)
|
insert_items(items, commit_period=args.commit_period)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def html_render_argparse(args):
    '''
    Command-line handler for html_render. Each ID in args.ids is built into
    an item tree, rendered to an HTML page, and then either written to a
    file or printed.

    args:
        The parsed arguments. args.ids is the list of item IDs. args.output,
        if truthy, is a filename which may contain "{id}" to be replaced by
        the item's ID; the file is written as utf-8. Without args.output the
        HTML is printed to stdout.

    Returns 0, like the other command handlers.
    '''
    # The locals are named so they do not shadow the id builtin or the
    # imported html module.
    for item_id in args.ids:
        tree = build_item_tree(id=item_id)
        soup = html_render_page(tree)
        page = str(soup)
        if args.output:
            filename = args.output.format(id=item_id)
            with open(filename, 'w', encoding='utf-8') as handle:
                handle.write(page)
        else:
            print(page)

    return 0
|
||||||
|
|
||||||
@ctrlc_commit
|
@ctrlc_commit
|
||||||
def livestream_argparse(args):
|
def livestream_argparse(args):
|
||||||
NOTIFY_EVERY_LINE.set(True)
|
NOTIFY_EVERY_LINE.set(True)
|
||||||
|
@ -438,6 +676,11 @@ def main(argv):
|
||||||
p_get.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
p_get.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
||||||
p_get.set_defaults(func=get_argparse)
|
p_get.set_defaults(func=get_argparse)
|
||||||
|
|
||||||
|
p_html_render = subparsers.add_parser('html_render', aliases=['html-render'])
|
||||||
|
p_html_render.add_argument('ids', nargs='+')
|
||||||
|
p_html_render.add_argument('--output', default=None)
|
||||||
|
p_html_render.set_defaults(func=html_render_argparse)
|
||||||
|
|
||||||
p_livestream = subparsers.add_parser('livestream')
|
p_livestream = subparsers.add_parser('livestream')
|
||||||
p_livestream.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
p_livestream.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
||||||
p_livestream.set_defaults(func=livestream_argparse)
|
p_livestream.set_defaults(func=livestream_argparse)
|
||||||
|
|
Loading…
Reference in a new issue