Add readme.
This commit is contained in:
parent
02ce43bdfe
commit
1a889a62ef
3 changed files with 110 additions and 16 deletions
74
README.md
Normal file
74
README.md
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
hnarchive
|
||||||
|
=========
|
||||||
|
|
||||||
|
hnarchive downloads all HN items (threads and comments) into an SQLite database. At this time, my database is 23.18 GiB with just over 25,000,000 items. I'd be happy to share it.
|
||||||
|
|
||||||
|
Please `pip install requests` and `pip install voussoirkit`.
|
||||||
|
|
||||||
|
According to the [HN API docs](https://github.com/HackerNews/API), there is no enforced rate limit, so just use a `threads` count that seems polite.
|
||||||
|
|
||||||
|
To get started, just run `python hnarchive.py update` and it will start from item ID 1. In the future, you can run `update` as a cron job or use `livestream` to get new items forever. Note that `update` always starts from the highest ID in the database, so if you use `get` to fetch a range of IDs that is ahead of your update schedule, your next `update` will miss the skipped IDs.
|
||||||
|
|
||||||
|
Here are all of the subcommands:
|
||||||
|
|
||||||
|
get:
|
||||||
|
Get items between two IDs, inclusive.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
--lower id:
|
||||||
|
Lower bound item ID.
|
||||||
|
|
||||||
|
--upper id:
|
||||||
|
Upper bound item ID.
|
||||||
|
|
||||||
|
--threads X:
|
||||||
|
Use X threads to download items. Default = 1 thread.
|
||||||
|
|
||||||
|
--commit_period X:
|
||||||
|
Commit the database after every X insertions. Default = 200.
|
||||||
|
|
||||||
|
livestream:
|
||||||
|
Watch for new items in an infinite loop.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
--commit_period X:
|
||||||
|
Commit the database after every X insertions. Default = 200.
|
||||||
|
|
||||||
|
update:
|
||||||
|
Get new items, from the highest ID in the database to the present.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
--threads X:
|
||||||
|
Use X threads to download items. Default = 1 thread.
|
||||||
|
|
||||||
|
--commit_period X:
|
||||||
|
Commit the database after every X insertions. Default = 200.
|
||||||
|
|
||||||
|
update_items:
|
||||||
|
Redownload items to update their scores, descendant counts, etc.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
--days X:
|
||||||
|
Update items where the retrieval date is less than X days ahead of the
|
||||||
|
submission date.
|
||||||
|
Stories are only open for comments for 14 days, so the `descendants`
|
||||||
|
count of any story younger than 14 days should be considered volatile.
|
||||||
|
It seems the upvote button does not disappear at any age, though I
|
||||||
|
don't know whether votes on old submissions will actually count.
|
||||||
|
Regardless, votes and comments tend to solidify within a day or two
|
||||||
|
after submission so a small number should be sufficient.
|
||||||
|
|
||||||
|
--threads X:
|
||||||
|
Use X threads to download items. Default = 1 thread.
|
||||||
|
|
||||||
|
--only_mature:
|
||||||
|
If provided, only update items where the submission date is more than 14
|
||||||
|
days ago. Without this, you will be updating items which are very close
|
||||||
|
to the present time, an effort which you may find wasteful.
|
||||||
|
|
||||||
|
--commit_period X:
|
||||||
|
Commit the database after every X insertions. Default = 200.
|
||||||
|
|
||||||
|
https://github.com/voussoir/hnarchive
|
||||||
|
|
||||||
|
https://gitlab.com/voussoir/hnarchive
|
50
hnarchive.py
50
hnarchive.py
|
@ -255,11 +255,14 @@ hnarchive.py
|
||||||
|
|
||||||
{get}
|
{get}
|
||||||
|
|
||||||
{update}
|
|
||||||
|
|
||||||
{livestream}
|
{livestream}
|
||||||
|
|
||||||
|
{update}
|
||||||
|
|
||||||
{update_items}
|
{update_items}
|
||||||
|
|
||||||
|
TO SEE DETAILS ON EACH COMMAND, RUN
|
||||||
|
> hnarchive.py <command> --help
|
||||||
'''.lstrip()
|
'''.lstrip()
|
||||||
|
|
||||||
SUB_DOCSTRINGS = dict(
|
SUB_DOCSTRINGS = dict(
|
||||||
|
@ -268,17 +271,26 @@ get:
|
||||||
Get items between two IDs, inclusive.
|
Get items between two IDs, inclusive.
|
||||||
|
|
||||||
flags:
|
flags:
|
||||||
--lower:
|
--lower id:
|
||||||
Lower bound item ID.
|
Lower bound item ID.
|
||||||
|
|
||||||
--upper:
|
--upper id:
|
||||||
Upper bound item ID.
|
Upper bound item ID.
|
||||||
|
|
||||||
--threads X:
|
--threads X:
|
||||||
Use X threads to download items. Default = 1 thread.
|
Use X threads to download items. Default = 1 thread.
|
||||||
|
|
||||||
--commit_period X:
|
--commit_period X:
|
||||||
Commit the database after every X insertions. Default = 1000.
|
Commit the database after every X insertions. Default = 200.
|
||||||
|
'''.strip(),
|
||||||
|
|
||||||
|
livestream='''
|
||||||
|
livestream:
|
||||||
|
Watch for new items in an infinite loop.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
--commit_period X:
|
||||||
|
Commit the database after every X insertions. Default = 200.
|
||||||
'''.strip(),
|
'''.strip(),
|
||||||
|
|
||||||
update='''
|
update='''
|
||||||
|
@ -288,6 +300,9 @@ update:
|
||||||
flags:
|
flags:
|
||||||
--threads X:
|
--threads X:
|
||||||
Use X threads to download items. Default = 1 thread.
|
Use X threads to download items. Default = 1 thread.
|
||||||
|
|
||||||
|
--commit_period X:
|
||||||
|
Commit the database after every X insertions. Default = 200.
|
||||||
'''.strip(),
|
'''.strip(),
|
||||||
|
|
||||||
update_items='''
|
update_items='''
|
||||||
|
@ -298,11 +313,12 @@ update_items:
|
||||||
--days X:
|
--days X:
|
||||||
Update items where the retrieval date is less than X days ahead of the
|
Update items where the retrieval date is less than X days ahead of the
|
||||||
submission date.
|
submission date.
|
||||||
Stories are only open for comments for 14 days.
|
Stories are only open for comments for 14 days, so the `descendants`
|
||||||
|
count of any story younger than 14 days should be considered volatile.
|
||||||
It seems the upvote button does not disappear at any age, though I
|
It seems the upvote button does not disappear at any age, though I
|
||||||
don't know whether votes on old submissions will actually count.
|
don't know whether votes on old submissions will actually count.
|
||||||
Regardless, votes tend to solidify within a day or two after
|
Regardless, votes and comments tend to solidify within a day or two
|
||||||
submission so a small number should be sufficient.
|
after submission so a small number should be sufficient.
|
||||||
|
|
||||||
--threads X:
|
--threads X:
|
||||||
Use X threads to download items. Default = 1 thread.
|
Use X threads to download items. Default = 1 thread.
|
||||||
|
@ -311,11 +327,9 @@ update_items:
|
||||||
If True, only update items where the submission date is more than 14
|
If True, only update items where the submission date is more than 14
|
||||||
days ago. Without this, you will be updating items which are very close
|
days ago. Without this, you will be updating items which are very close
|
||||||
to the present time, an effort which you may find wasteful.
|
to the present time, an effort which you may find wasteful.
|
||||||
'''.strip(),
|
|
||||||
|
|
||||||
livestream='''
|
--commit_period X:
|
||||||
livestream:
|
Commit the database after every X insertions. Default = 200.
|
||||||
Watch for new items in an infinite loop.
|
|
||||||
'''.strip(),
|
'''.strip(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -335,7 +349,7 @@ def get_argparse(args):
|
||||||
|
|
||||||
def livestream_argparse(args):
|
def livestream_argparse(args):
|
||||||
try:
|
try:
|
||||||
insert_items(livestream())
|
insert_items(livestream(), commit_period=args.commit_period)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
commit()
|
commit()
|
||||||
|
|
||||||
|
@ -350,7 +364,7 @@ def update_argparse(args):
|
||||||
ids = range(lower, upper+1)
|
ids = range(lower, upper+1)
|
||||||
items = get_items(ids, threads=args.threads)
|
items = get_items(ids, threads=args.threads)
|
||||||
|
|
||||||
insert_items(items)
|
insert_items(items, commit_period=args.commit_period)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
commit()
|
commit()
|
||||||
|
|
||||||
|
@ -366,11 +380,12 @@ def update_items_argparse(args):
|
||||||
cur = sql.execute(query, bindings)
|
cur = sql.execute(query, bindings)
|
||||||
ids = cur.fetchall()
|
ids = cur.fetchall()
|
||||||
|
|
||||||
|
log.info('Updating %d items.', len(ids))
|
||||||
ids = [id for (id,) in ids]
|
ids = [id for (id,) in ids]
|
||||||
items = get_items(ids, threads=args.threads)
|
items = get_items(ids, threads=args.threads)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
insert_items(items)
|
insert_items(items, commit_period=args.commit_period)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
commit()
|
commit()
|
||||||
|
|
||||||
|
@ -384,20 +399,23 @@ def main(argv):
|
||||||
p_get.add_argument('--lower', type=int, default=None)
|
p_get.add_argument('--lower', type=int, default=None)
|
||||||
p_get.add_argument('--upper', type=int, default=None)
|
p_get.add_argument('--upper', type=int, default=None)
|
||||||
p_get.add_argument('--threads', type=int, default=None)
|
p_get.add_argument('--threads', type=int, default=None)
|
||||||
p_get.add_argument('--commit_period', '--commit-period', type=int, default=1000)
|
p_get.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
||||||
p_get.set_defaults(func=get_argparse)
|
p_get.set_defaults(func=get_argparse)
|
||||||
|
|
||||||
p_livestream = subparsers.add_parser('livestream')
|
p_livestream = subparsers.add_parser('livestream')
|
||||||
|
p_livestream.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
||||||
p_livestream.set_defaults(func=livestream_argparse)
|
p_livestream.set_defaults(func=livestream_argparse)
|
||||||
|
|
||||||
p_update = subparsers.add_parser('update')
|
p_update = subparsers.add_parser('update')
|
||||||
p_update.add_argument('--threads', type=int, default=None)
|
p_update.add_argument('--threads', type=int, default=None)
|
||||||
|
p_update.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
||||||
p_update.set_defaults(func=update_argparse)
|
p_update.set_defaults(func=update_argparse)
|
||||||
|
|
||||||
p_update_items = subparsers.add_parser('update_items', aliases=['update-items'])
|
p_update_items = subparsers.add_parser('update_items', aliases=['update-items'])
|
||||||
p_update_items.add_argument('--days', type=float, required=True)
|
p_update_items.add_argument('--days', type=float, required=True)
|
||||||
p_update_items.add_argument('--threads', type=int, default=None)
|
p_update_items.add_argument('--threads', type=int, default=None)
|
||||||
p_update_items.add_argument('--only_mature', '--only-mature', action='store_true')
|
p_update_items.add_argument('--only_mature', '--only-mature', action='store_true')
|
||||||
|
p_update_items.add_argument('--commit_period', '--commit-period', type=int, default=200)
|
||||||
p_update_items.set_defaults(func=update_items_argparse)
|
p_update_items.set_defaults(func=update_items_argparse)
|
||||||
|
|
||||||
return betterhelp.subparser_main(
|
return betterhelp.subparser_main(
|
||||||
|
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
requests
|
||||||
|
voussoirkit
|
Loading…
Reference in a new issue