commit 708c774e52c03fb21dcfbd4af5131b5272729124 Author: Ethan Dalool Date: Mon Nov 13 19:13:19 2017 -0800 Initial migratory commit from voussoir/reddit. diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..3612116 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,30 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp +*.sln merge=union +*.csproj merge=union +*.vbproj merge=union +*.fsproj merge=union +*.dbproj merge=union + +*.psd binary +*.zip binary +*.db binary +*.png binary +*.jpg binary +*.ico binary +*.exe binary + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..05c1d1c --- /dev/null +++ b/.gitignore @@ -0,0 +1,231 @@ +databases/* +@hangman.md +hangman.py +merge_database.py +migrate_20160605.py +timesearch_backup.py + +*.ignore +*.db-journal +*.pydevproject +.project +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + + +################# +## Visual Studio +################# + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results + +[Dd]ebug/ +[Rr]elease/ +x64/ +build/ +[Bb]in/ +[Oo]bj/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.log +*.scc + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf +*.cachefile + +# Visual Studio profiler +*.psess +*.vsp +*.vspx + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch +*.ncrunch* +.*crunch*.local.xml + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.Publish.xml +*.pubxml + +# NuGet Packages Directory +## TODO: If you have NuGet Package Restore enabled, uncomment the next line +#packages/ + +# Windows Azure Build Output +csx +*.build.csdef + +# Windows Store app package directory +AppPackages/ + +# Others +sql/ +*.Cache +ClientBin/ +[Ss]tyle[Cc]op.* +~$* +*~ +*.dbmdl +*.[Pp]ublish.xml +*.pfx +*.publishsettings + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file to a newer +# Visual Studio version. 
Backup files are not needed, because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +App_Data/*.mdf +App_Data/*.ldf + +############# +## Windows detritus +############# + +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Mac crap +.DS_Store + + +############# +## Python +############# + +*.py[co] + +# Packages +*.egg +*.egg-info +dist/ +build/ +eggs/ +parts/ +var/ +sdist/ +develop-eggs/ +.installed.cfg + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox + +#Translations +*.mo + +#Mr Developer +.mr.developer.cfg +======= +*~ +*.egg +*.pyc +.coverage +*.egg-info/ +_build/ +build/ +dist/ +.DS_Store + diff --git a/README.md b/README.md new file mode 100644 index 0000000..b6c9e8d --- /dev/null +++ b/README.md @@ -0,0 +1,114 @@ +timesearch +========== + +I don't have a test suite. You're my test suite! Messages go to [/u/GoldenSights](https://reddit.com/u/GoldenSights). + +Timesearch is a collection of utilities for archiving subreddits. + +### Make sure you have: +- Installed [Python](https://www.python.org/download). I use Python 3.6. +- Installed PRAW >= 4, as well as the other modules in `requirements.txt`. Try `pip install -r requirements.txt` to get them all. +- Created an OAuth app at https://reddit.com/prefs/apps. Make it `script` type, and set the redirect URI to `http://localhost:8080`. The title and description can be anything you want, and the about URL is not required. +- Used [this PRAW script](https://praw.readthedocs.io/en/latest/tutorials/refresh_token.html) to generate a refresh token. Just save it as a .py file somewhere and run it through your terminal / command line. For simplicity's sake, I just choose `all` for the scopes. +- Downloaded a copy of [this file](https://github.com/voussoir/reddit/blob/master/bot4.py) and saved it as `bot.py`. Fill out the variables using your OAuth information, and read the instructions to see where to put it. The Useragent is a description of your API usage. Typically "/u/username's praw client" is sufficient. + +### This package consists of: + +- **timesearch**: If you try to page through `/new` on a subreddit, you'll hit a limit at or before 1,000 posts. Timesearch uses the `timestamp` cloudsearch query parameter to step from the beginning of a subreddit to present time, to collect as many submissions as possible. Read more about timestamp searching [here](https://www.reddit.com/r/reddittips/comments/2ix73n/use_cloudsearch_to_search_for_posts_on_reddit/). + `> timesearch.py timesearch -r subredditname ` + `> timesearch.py timesearch -u username ` + +- **commentaugment**: Although we can search for submissions, we cannot search for comments. After performing a timesearch, you can use commentaugment to download the comment tree for each submission. + Note: commentaugment only gets the comments attached to the submissions that you found in your timesearch scan. If you're trying to commentaugment on a user, you're going to get comments that were made on their submissions, **not** comments they made on other people's submissions. Therefore, comprehensively collecting a user's activity is not possible. You will have to use someone else's dataset like that of [/u/Stuck_in_the_Matrix](https://reddit.com/u/Stuck_in_the_Matrix) at [pushshift.io](https://pushshift.io). 
+    `> timesearch.py commentaugment -r subredditname <flags>`
+    `> timesearch.py commentaugment -u username <flags>`
+
+- **livestream**: timesearch+commentaugment is great for starting your database and getting historical posts, but it's not the best for staying up-to-date. Instead, livestream monitors `/new` and `/comments` to continuously ingest data.
+    `> timesearch.py livestream -r subredditname <flags>`
+    `> timesearch.py livestream -u username <flags>`
+
+- **getstyles**: Downloads the subreddit's stylesheet and CSS images.
+    `> timesearch.py getstyles -r subredditname`
+
+- **getwiki**: Downloads the wiki pages, sidebar, etc. from /wiki/pages.
+    `> timesearch.py getwiki -r subredditname`
+
+- **offline_reading**: Renders comment threads into HTML via markdown.
+    Note: I'm currently using the [markdown library from pypi](https://pypi.python.org/pypi/Markdown), and it doesn't do reddit's custom markdown like `/r/` or `/u/`, obviously. So far I don't think anybody really uses offline_reading, so I haven't invested much time into improving it.
+    `> timesearch.py offline_reading -r subredditname <flags>`
+    `> timesearch.py offline_reading -u username <flags>`
+
+- **redmash**: Generates plaintext or HTML lists of submissions, sorted by a property of your choosing. You can order by date, author, flair, etc.
+    `> timesearch.py redmash -r subredditname <flags>`
+    `> timesearch.py redmash -u username <flags>`
+
+- **breakdown**: Produces a JSON file indicating which users make the most posts in a subreddit, or which subreddits a user posts in.
+    `> timesearch.py breakdown -r subredditname`
+    `> timesearch.py breakdown -u username`
+
+- **mergedb**: Copies all new data from one timesearch database into another. Useful for syncing or merging two scans of the same subreddit.
+    `> timesearch.py mergedb --from filepath/database1.db --to filepath/database2.db`
+
+### To use it
+
+You will need both the `timesearch` package (folder) and the external `timesearch.py` file. You can click the green "Clone or Download" button in the upper right. When you run the .py file, it sends your command-line arguments into the package. You can view a summarized version of all the help text with just `timesearch.py`, or you can view a specific docstring with `timesearch.py livestream`, etc.
+
+I recommend [sqlitebrowser](https://github.com/sqlitebrowser/sqlitebrowser/releases) if you want to inspect the database yourself.
+
+### Changelog
+- 2017 11 13
+    - Gave timesearch its own GitHub repository so that (1) it will be easier for people to download and (2) it has a cleaner, more independent URL. [voussoir/timesearch](https://github.com/voussoir/timesearch)
+
+- 2017 11 05
+    - Added a try-except inside the livestream helper to prevent the generator from terminating.
+
+- 2017 11 04
+    - For timesearch, I switched from using my custom cloudsearch iterator to the one that comes with PRAW4+.
+
+- 2017 10 12
+    - Added the `mergedb` utility for combining databases.
+
+- 2017 06 02
+    - You can use `commentaugment -s abcdef` to get a particular thread even if you haven't scraped anything else from that subreddit. Previously `-s` only worked if the database already existed and you specified it via `-r`. Now it is inferred from the submission itself.
+
+- 2017 04 28
+    - Complete restructure into a package; started using PRAW4.
+
+- 2016 08 10
+    - Started merging redmash and wrote its argparser.
+
+- 2016 07 03
+    - Improved docstring clarity.
+
+- 2016 07 02
+    - Added the `livestream` argparser.
+
+- 2016 06 07
+    - `offline_reading` has been merged with the main timesearch file.
+    - `get_all_posts` renamed to `timesearch`.
+    - Timesearch parameter `usermode` renamed to `username`; `maxupper` renamed to `upper`.
+    - Everything is now accessible via command-line arguments. Read the docstring at the top of the file.
+
+- 2016 06 05
+    - NEW DATABASE SCHEME. Submissions and comments now live in different tables like they should have all along. The submission table has two new columns for a little bit of commentaugment metadata. This allows commentaugment to only scan threads that are new.
+    - You can use the `migrate_20160605.py` script to convert old databases into new ones.
+
+- 2015 11 11
+    - Created `offline_reading.py`, which converts a timesearch database into a comment tree that can be rendered into HTML.
+
+- 2015 09 07
+    - Fixed a bug which allowed `livestream` to crash because `bot.refresh()` was outside of the try-catch.
+
+- 2015 08 19
+    - Fixed a bug in which updatescores stopped iterating early if you had more than 100 comments in a row in the db.
+    - commentaugment has been completely merged into the timesearch.py file. You can use `commentaugment_prompt()` to input the parameters, or use the `commentaugment()` function directly.
+
+
+____
+
+
+I want to live in a future where everyone uses UTC and agrees on daylight saving time.
+

+ Timesearch +

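The setup steps in the README boil down to handing PRAW your OAuth app credentials and refresh token. As a rough, hypothetical sketch only (this is not the real `bot.py`/`bot4.py`, whose variable names and helpers may differ; every value below is a placeholder), a PRAW 4+ script-app login looks roughly like this:

```python
# Illustrative sketch only -- not the actual bot.py/bot4.py used by timesearch.
# All credential values are placeholders for the ones from your own OAuth app.
import praw

reddit = praw.Reddit(
    client_id='your_app_id',             # from https://reddit.com/prefs/apps
    client_secret='your_app_secret',
    refresh_token='your_refresh_token',  # from the PRAW refresh-token script
    user_agent="/u/yourusername's praw client",
)

print(reddit.user.me())  # prints your username if the credentials work
```

For reference, timesearch's `common.py` expects the `bot` module to expose a `praw` attribute, an `anonymous()` constructor for read-only access, and a `login()` call, so the real file wires these pieces together somewhat differently than this sketch.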
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..59d1286 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +markdown +praw +voussoirkit diff --git a/timesearch.py b/timesearch.py new file mode 100644 index 0000000..4f48192 --- /dev/null +++ b/timesearch.py @@ -0,0 +1,5 @@ +import sys +import timesearch + +status_code = timesearch.main(sys.argv[1:]) +raise SystemExit(status_code) diff --git a/timesearch/__init__.py b/timesearch/__init__.py new file mode 100644 index 0000000..5ebe981 --- /dev/null +++ b/timesearch/__init__.py @@ -0,0 +1,436 @@ +import argparse +import sys + +from . import exceptions + +# NOTE: Originally I wanted the docstring for each module to be within their +# file. However, this means that composing the global helptext would require +# importing those modules, which will subsequently import PRAW and a whole lot +# of other things. This made TS very slow to load which is okay when you're +# actually using it but really terrible when you're just viewing the help text. +DOCSTRING = ''' +Timesearch +The subreddit archiver + +The basics: +1. Collect a subreddit's submissions + > timesearch.py timesearch -r subredditname + +2. Collect the comments for those submissions + > timesearch.py commentaugment -r subredditname + +3. Stay up-to-date + > timesearch.py livestream -r subredditname + + +Commands for collecting: +{timesearch} +{commentaugment} +{livestream} +{getstyles} +{getwiki} + +Commands for processing: +{offline_reading} +{redmash} +{breakdown} +{mergedb} + +TO SEE DETAILS ON EACH COMMAND, RUN +> timesearch.py +''' + +MODULE_DOCSTRINGS = { + 'breakdown': ''' +breakdown: + Give the comment / submission counts for users in a subreddit, or + the subreddits that a user posts to. + + Automatically dumps into a _breakdown.json file + in the same directory as the database. + + > timesearch.py breakdown -r subredditname + > timesearch.py breakdown -u username + + flags: + -r "test" | --subreddit "test": + The subreddit database to break down. + + -u "test" | --username "test": + The username database to break down. + + --sort "name" | "submissions" | "comments" | "total_posts" + Sort the output. +''', + + 'commentaugment': ''' +commentaugment: + Collect comments for the submissions in the database. + NOTE - if you did a timesearch scan on a username, this function is + mostly useless. It collects comments that were made on OP's submissions + but it does not find OP's comments on other people's submissions which + is what you probably wanted. Unfortunately that's not possible. + + > timesearch.py commentaugment -r subredditname + > timesearch.py commentaugment -u username + + flags: + -l 18 | --limit 18: + The number of MoreComments objects to replace. + Default: No limit + + -t 5 | --threshold 5: + The number of comments a MoreComments object must claim to have + for us to open it. + Actual number received may be lower. + Default: >= 0 + + -n 4 | --num_thresh 4: + The number of comments a submission must claim to have for us to + scan it at all. + Actual number received may be lower. + Default: >= 1 + + -s "t3_xxxxxx" | --specific "t3_xxxxxx": + Given a submission ID, t3_xxxxxx, scan only that submission. + + -v | --verbose: + If provided, print more stuff while working. +''', + + 'getstyles': ''' +getstyles: + Collect the stylesheet, and css images. + + > timesearch.py getstyles -r subredditname +''', + + 'getwiki': ''' +getwiki: + Collect all available wiki pages. 
+ + > timesearch.py getwiki -r subredditname +''', + + 'mergedb': ''' +mergedb: + Copy all new posts from one timesearch database into another. + + > timesearch mergedb --from redditdev1.db --to redditdev2.db + + flags: + --from: + The database file containing the posts you wish to copy. + + --to: + The database file to which you will copy the posts. + The database is modified in-place. + Existing posts will be ignored and not updated. +''', + + 'livestream': ''' +livestream: + Continously collect submissions and/or comments. + + > timesearch.py livestream -r subredditname + > timesearch.py livestream -u username + + flags: + -r "test" | --subreddit "test": + The subreddit to collect from. + + -u "test" | --username "test": + The redditor to collect from. + + -s | --submissions: + If provided, do collect submissions. Otherwise don't. + + -c | --comments: + If provided, do collect comments. Otherwise don't. + + If submissions and comments are BOTH left unspecified, then they will + BOTH be collected. + + -v | --verbose: + If provided, print extra information to the screen. + + -w 30 | --wait 30: + The number of seconds to wait between cycles. + + -1 | --once: + If provided, only do a single loop. Otherwise go forever. +''', + + 'offline_reading': ''' +offline_reading: + Render submissions and comment threads to HTML via Markdown. + + > timesearch.py offline_reading -r subredditname + > timesearch.py offline_reading -u username + + flags: + -s "t3_xxxxxx" | --specific "t3_xxxxxx": + Given a submission ID, t3_xxxxxx, render only that submission. + Otherwise render every submission in the database. +''', + + 'redmash': ''' +redmash: + Dump submission listings to a plaintext or HTML file. + + > timesearch.py redmash -r subredditname + > timesearch.py redmash -u username + + flags: + -r "test" | --subreddit "test": + The subreddit database to dump + + -u "test" | --username "test": + The username database to dump + + --html: + Write HTML files instead of plain text. + + -st 50 | --score_threshold 50: + Only mash posts with at least this many points. + Applies to ALL mashes! + + --all: + Perform all of the mashes listed below. + + --date: + Perform a mash sorted by date. + + --title: + Perform a mash sorted by title. + + --score: + Perform a mash sorted by score. + + --author: + For subreddit databases only. + Perform a mash sorted by author. + + --sub: + For username databases only. + Perform a mash sorted by subreddit. + + --flair: + Perform a mash sorted by flair. + + examples: + `timesearch redmash -r botwatch --date` + does only the date file. + + `timesearch redmash -r botwatch --score --title` + does both the score and title files. + + `timesearch redmash -r botwatch --score --score_threshold 50` + only shows submissions with >= 50 points. + + `timesearch redmash -r botwatch --all` + performs all of the different mashes. +''', + + 'timesearch': ''' +timesearch: + Collect submissions from the subreddit across all of history, or + Collect submissions by a user (as many as possible). + + > timesearch.py timesearch -r subredditname + > timesearch.py timesearch -u username + + -r "test" | --subreddit "test": + The subreddit to scan. Mutually exclusive with username. + + -u "test" | --username "test": + The user to scan. Mutually exclusive with subreddit. + + -l "update" | --lower "update": + If a number - the unix timestamp to start at. + If "update" - continue from latest submission in db. + Default: update + + -up 1467460221 | --upper 1467460221: + If a number - the unix timestamp to stop at. 
+ If not provided - stop at current time. + Default: current time + + -i 86400 | --interval 86400: + The initial interval for the scanning window, in seconds. + This is only a starting value. The window will shrink and stretch + as necessary based on received submission counts. + Default: 86400 +''', +} + + +def docstring_preview(text): + ''' + Return the brief description at the top of the text. + User can get full text by looking at each specifically. + ''' + return text.split('\n\n')[0] + +def listget(li, index, fallback=None): + try: + return li[index] + except IndexError: + return fallback + +def indent(text, spaces=4): + spaces = ' ' * spaces + return '\n'.join(spaces + line if line.strip() != '' else line for line in text.split('\n')) + +docstring_headers = { + key: indent(docstring_preview(value)) + for (key, value) in MODULE_DOCSTRINGS.items() +} + +DOCSTRING = DOCSTRING.format(**docstring_headers) + +#################################################################################################### +#################################################################################################### + +def breakdown_gateway(args): + from . import breakdown + breakdown.breakdown_argparse(args) + +def commentaugment_gateway(args): + from . import commentaugment + commentaugment.commentaugment_argparse(args) + +def getstyles_gateway(args): + from . import getstyles + getstyles.getstyles_argparse(args) + +def getwiki_gateway(args): + from . import getwiki + getwiki.getwiki_argparse(args) + +def livestream_gateway(args): + from . import livestream + livestream.livestream_argparse(args) + +def mergedb_gateway(args): + from . import mergedb + mergedb.mergedb_argparse(args) + +def offline_reading_gateway(args): + from . import offline_reading + offline_reading.offline_reading_argparse(args) + +def redmash_gateway(args): + from . import redmash + redmash.redmash_argparse(args) + +def timesearch_gateway(args): + from . 
import timesearch + timesearch.timesearch_argparse(args) + + +parser = argparse.ArgumentParser() +subparsers = parser.add_subparsers() + +p_breakdown = subparsers.add_parser('breakdown') +p_breakdown.add_argument('--sort', dest='sort', default=None) +p_breakdown.add_argument('-r', '--subreddit', dest='subreddit', default=None) +p_breakdown.add_argument('-u', '--user', dest='username', default=None) +p_breakdown.set_defaults(func=breakdown_gateway) + +p_commentaugment = subparsers.add_parser('commentaugment') +p_commentaugment.add_argument('-l', '--limit', dest='limit', default=None) +p_commentaugment.add_argument('-n', '--num_thresh', dest='num_thresh', default=1) +p_commentaugment.add_argument('-r', '--subreddit', dest='subreddit', default=None) +p_commentaugment.add_argument('-s', '--specific', dest='specific_submission', default=None) +p_commentaugment.add_argument('-t', '--threshold', dest='threshold', default=0) +p_commentaugment.add_argument('-u', '--user', dest='username', default=None) +p_commentaugment.add_argument('-v', '--verbose', dest='verbose', action='store_true') +p_commentaugment.set_defaults(func=commentaugment_gateway) + +p_getstyles = subparsers.add_parser('getstyles') +p_getstyles.add_argument('-r', '--subreddit', dest='subreddit') +p_getstyles.set_defaults(func=getstyles_gateway) + +p_getwiki = subparsers.add_parser('getwiki') +p_getwiki.add_argument('-r', '--subreddit', dest='subreddit') +p_getwiki.set_defaults(func=getwiki_gateway) + +p_livestream = subparsers.add_parser('livestream') +p_livestream.add_argument('-1', '--once', dest='once', action='store_true') +p_livestream.add_argument('-c', '--comments', dest='comments', action='store_true') +p_livestream.add_argument('-l', '--limit', dest='limit', default=None) +p_livestream.add_argument('-r', '--subreddit', dest='subreddit', default=None) +p_livestream.add_argument('-s', '--submissions', dest='submissions', action='store_true') +p_livestream.add_argument('-u', '--user', dest='username', default=None) +p_livestream.add_argument('-v', '--verbose', dest='verbose', action='store_true') +p_livestream.add_argument('-w', '--wait', dest='sleepy', default=30) +p_livestream.set_defaults(func=livestream_gateway) + +p_mergedb = subparsers.add_parser('mergedb') +p_mergedb.add_argument('--from', dest='from_db_path', required=True) +p_mergedb.add_argument('--to', dest='to_db_path', required=True) +p_mergedb.set_defaults(func=mergedb_gateway) + +p_offline_reading = subparsers.add_parser('offline_reading') +p_offline_reading.add_argument('-r', '--subreddit', dest='subreddit', default=None) +p_offline_reading.add_argument('-s', '--specific', dest='specific_submission', default=None) +p_offline_reading.add_argument('-u', '--user', dest='username', default=None) +p_offline_reading.set_defaults(func=offline_reading_gateway) + +p_redmash = subparsers.add_parser('redmash') +p_redmash.add_argument('--all', dest='do_all', action='store_true') +p_redmash.add_argument('--author', dest='do_author', action='store_true') +p_redmash.add_argument('--date', dest='do_date', action='store_true') +p_redmash.add_argument('--flair', dest='do_flair', action='store_true') +p_redmash.add_argument('--html', dest='html', action='store_true') +p_redmash.add_argument('--score', dest='do_score', action='store_true') +p_redmash.add_argument('--sub', dest='do_subreddit', action='store_true') +p_redmash.add_argument('--title', dest='do_title', action='store_true') +p_redmash.add_argument('-r', '--subreddit', dest='subreddit', default=None) 
+p_redmash.add_argument('-st', '--score_threshold', dest='score_threshold', default=0) +p_redmash.add_argument('-u', '--user', dest='username', default=None) +p_redmash.set_defaults(func=redmash_gateway) + +p_timesearch = subparsers.add_parser('timesearch') +p_timesearch.add_argument('-i', '--interval', dest='interval', default=86400) +p_timesearch.add_argument('-l', '--lower', dest='lower', default='update') +p_timesearch.add_argument('-r', '--subreddit', dest='subreddit', default=None) +p_timesearch.add_argument('-u', '--user', dest='username', default=None) +p_timesearch.add_argument('-up', '--upper', dest='upper', default=None) +p_timesearch.set_defaults(func=timesearch_gateway) + +def main(argv): + helpstrings = {'', 'help', '-h', '--help'} + + command = listget(argv, 0, '').lower() + + # The user did not enter a command, or entered something unrecognized. + if command not in MODULE_DOCSTRINGS: + print(DOCSTRING) + if command == '': + print('You are seeing the default help text because you did not choose a command.') + elif command not in helpstrings: + print('You are seeing the default help text because "%s" was not recognized' % command) + return 1 + + # The user entered a command, but no further arguments, or just help. + argument = listget(argv, 1, '').lower() + if argument in helpstrings: + print(MODULE_DOCSTRINGS[command]) + return 1 + + args = parser.parse_args(argv) + try: + args.func(args) + except exceptions.DBNotFound as e: + message = '"%s" is not an existing database.' + message += '\nHave you used any of the other utilities to collect data?' + message = message % e.path.absolute_path + print(message) + return 1 + + return 0 + +if __name__ == '__main__': + raise SystemExit(main(sys.argv[1:])) diff --git a/timesearch/breakdown.py b/timesearch/breakdown.py new file mode 100644 index 0000000..8257db8 --- /dev/null +++ b/timesearch/breakdown.py @@ -0,0 +1,103 @@ +import os +import json + +from . import common +from . import tsdb + + +def breakdown_database(subreddit=None, username=None): + ''' + Given a database, return a json dict breaking down the submission / comment count for + users (if a subreddit database) or subreddits (if a user database). 
+ ''' + if (subreddit is None) == (username is None): + raise Exception('Enter subreddit or username but not both') + + breakdown_results = {} + def _ingest(names, subkey): + for name in names: + breakdown_results.setdefault(name, {}) + breakdown_results[name].setdefault(subkey, 0) + breakdown_results[name][subkey] += 1 + + if subreddit: + database = tsdb.TSDB.for_subreddit(subreddit, do_create=False) + else: + database = tsdb.TSDB.for_user(username, do_create=False) + cur = database.sql.cursor() + + for table in ['submissions', 'comments']: + if subreddit: + cur.execute('SELECT author FROM %s' % table) + elif username: + cur.execute('SELECT subreddit FROM %s' % table) + + names = (row[0] for row in common.fetchgenerator(cur)) + _ingest(names, table) + + for name in breakdown_results: + breakdown_results[name].setdefault('submissions', 0) + breakdown_results[name].setdefault('comments', 0) + + return breakdown_results + +def breakdown_argparse(args): + if args.subreddit: + database = tsdb.TSDB.for_subreddit(args.subreddit, do_create=False) + else: + database = tsdb.TSDB.for_user(args.username, do_create=False) + + breakdown_results = breakdown_database( + subreddit=args.subreddit, + username=args.username, + ) + + def sort_name(name): + return name.lower() + def sort_submissions(name): + invert_score = -1 * breakdown_results[name]['submissions'] + return (invert_score, name.lower()) + def sort_comments(name): + invert_score = -1 * breakdown_results[name]['comments'] + return (invert_score, name.lower()) + def sort_total_posts(name): + invert_score = breakdown_results[name]['submissions'] + breakdown_results[name]['comments'] + invert_score = -1 * invert_score + return (invert_score, name.lower()) + breakdown_sorters = { + 'name': sort_name, + 'submissions': sort_submissions, + 'comments': sort_comments, + 'total_posts': sort_total_posts, + } + + breakdown_names = list(breakdown_results.keys()) + if args.sort is not None: + try: + sorter = breakdown_sorters[args.sort.lower()] + except KeyError: + message = '{sorter} is not a sorter. Choose from {options}' + message = message.format(sorter=args.sort, options=list(breakdown_sorters.keys())) + raise KeyError(message) + breakdown_names.sort(key=sorter) + dump = ' "{name}": {{"submissions": {submissions}, "comments": {comments}}}' + dump = [dump.format(name=name, **breakdown_results[name]) for name in breakdown_names] + dump = ',\n'.join(dump) + dump = '{\n' + dump + '\n}\n' + else: + dump = json.dumps(breakdown_results) + + if args.sort is None: + breakdown_basename = '%s_breakdown.json' + else: + breakdown_basename = '%%s_breakdown_%s.json' % args.sort + + breakdown_basename = breakdown_basename % database.filepath.replace_extension('').basename + breakdown_filepath = database.breakdown_dir.with_child(breakdown_basename) + os.makedirs(breakdown_filepath.parent.absolute_path, exist_ok=True) + breakdown_file = open(breakdown_filepath.absolute_path, 'w') + with breakdown_file: + breakdown_file.write(dump) + print('Wrote', breakdown_filepath.relative_path) + + return breakdown_results diff --git a/timesearch/commentaugment.py b/timesearch/commentaugment.py new file mode 100644 index 0000000..73d622a --- /dev/null +++ b/timesearch/commentaugment.py @@ -0,0 +1,179 @@ +import traceback + +from . import common +from . 
import tsdb + + +def commentaugment( + subreddit=None, + username=None, + limit=0, + num_thresh=0, + specific_submission=None, + threshold=0, + verbose=0, + ): + ''' + Take the IDs of collected submissions, and gather comments from those threads. + Please see the global DOCSTRING_COMMENTAUGMENT variable. + ''' + common.bot.login(common.r) + if specific_submission is not None: + if not specific_submission.startswith('t3_'): + specific_submission = 't3_' + specific_submission + specific_submission_obj = common.r.submission(specific_submission[3:]) + subreddit = specific_submission_obj.subreddit.display_name + + if (subreddit is None) == (username is None): + raise Exception('Enter subreddit or username but not both') + + if subreddit: + if specific_submission is None: + database = tsdb.TSDB.for_subreddit(subreddit, do_create=False) + else: + database = tsdb.TSDB.for_subreddit(subreddit, do_create=True) + else: + database = tsdb.TSDB.for_user(username, do_create=False) + cur = database.sql.cursor() + + if limit == 0: + limit = None + + if specific_submission is None: + query = ''' + SELECT idstr FROM submissions + WHERE idstr IS NOT NULL + AND augmented_at IS NULL + AND num_comments >= ? + ORDER BY num_comments DESC + ''' + bindings = [num_thresh] + cur.execute(query, bindings) + fetchall = [item[0] for item in cur.fetchall()] + else: + # Make sure the object we're augmenting is in the table too! + database.insert(specific_submission_obj) + fetchall = [specific_submission] + + totalthreads = len(fetchall) + + if verbose: + spacer = '\n\t' + else: + spacer = ' ' + + scannedthreads = 0 + get_submission = common.nofailrequest(get_submission_immediately) + while len(fetchall) > 0: + id_batch = fetchall[:100] + fetchall = fetchall[100:] + + for submission in id_batch: + submission = get_submission(submission.split('_')[-1]) + message = 'Processing {fullname}{spacer}expecting {num_comments} | ' + message = message.format( + fullname=submission.fullname, + spacer=spacer, + num_comments=submission.num_comments, + ) + + print(message, end='', flush=True) + if verbose: + print() + + comments = get_comments_for_thread(submission, limit, threshold, verbose) + + database.insert(comments, commit=False) + query = ''' + UPDATE submissions + set augmented_at = ?, + augmented_count = ? + WHERE idstr == ? + ''' + bindings = [common.get_now(), len(comments), submission.fullname] + cur.execute(query, bindings) + database.sql.commit() + + scannedthreads += 1 + if verbose: + print('\t', end='') + message = 'Found {count} |{spacer}{scannedthreads} / {totalthreads}' + message = message.format( + count=len(comments), + spacer=spacer, + scannedthreads=scannedthreads, + totalthreads=totalthreads, + ) + print(message) + +def get_comments_for_thread(submission, limit, threshold, verbose): + comments = common.nofailrequest(lambda x: x.comments)(submission) + # PRAW4 flatten is just list(). + comments = manually_replace_comments(comments, limit, threshold, verbose) + return comments + +def get_submission_immediately(submission_id): + submission = common.r.submission(submission_id) + # force the lazyloader + submission.title = submission.title + return submission + +def manually_replace_comments(incomments, limit=None, threshold=0, verbose=False): + ''' + PRAW's replace_more_comments method cannot continue + where it left off in the case of an Ow! screen. 
+ So I'm writing my own function to get each MoreComments item individually + + Furthermore, this function will maximize the number of retrieved comments by + sorting the MoreComments objects and getting the big chunks before worrying + about the tail ends. + ''' + incomments = incomments.list() + comments = [] + morecomments = [] + while len(incomments) > 0: + item = incomments.pop() + if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold: + morecomments.append(item) + elif isinstance(item, common.praw.models.Comment): + comments.append(item) + + while True: + try: + if limit is not None and limit <= 0: + break + if len(morecomments) == 0: + break + morecomments.sort(key=lambda x: x.count) + mc = morecomments.pop() + additional = common.nofailrequest(mc.comments)() + additionals = 0 + if limit is not None: + limit -= 1 + for item in additional: + if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold: + morecomments.append(item) + elif isinstance(item, common.praw.models.Comment): + comments.append(item) + additionals += 1 + if verbose: + s = '\tGot %d more, %d so far.' % (additionals, len(comments)) + if limit is not None: + s += ' Can perform %d more replacements' % limit + print(s) + except KeyboardInterrupt: + raise + except Exception: + traceback.print_exc() + return comments + +def commentaugment_argparse(args): + return commentaugment( + subreddit=args.subreddit, + username=args.username, + limit=common.int_none(args.limit), + threshold=common.int_none(args.threshold), + num_thresh=common.int_none(args.num_thresh), + verbose=args.verbose, + specific_submission=args.specific_submission, + ) diff --git a/timesearch/common.py b/timesearch/common.py new file mode 100644 index 0000000..dfd7960 --- /dev/null +++ b/timesearch/common.py @@ -0,0 +1,104 @@ +import datetime +import os +import time +import traceback + +try: + import praw +except ImportError: + praw = None +if praw is None or praw.__version__.startswith('3.'): + import praw4 + praw = praw4 + +try: + import bot +except ImportError: + bot = None +if bot is None or bot.praw != praw: + import bot4 + bot = bot4 + + +r = bot.anonymous() + +def assert_file_exists(filepath): + if not os.path.exists(filepath): + raise FileNotFoundError(filepath) + +def b36(i): + if isinstance(i, int): + return base36encode(i) + return base36decode(i) + +def base36decode(number): + return int(number, 36) + +def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'): + """Converts an integer to a base36 string.""" + if not isinstance(number, (int)): + raise TypeError('number must be an integer') + base36 = '' + sign = '' + if number < 0: + sign = '-' + number = -number + if 0 <= number < len(alphabet): + return sign + alphabet[number] + while number != 0: + number, i = divmod(number, len(alphabet)) + base36 = alphabet[i] + base36 + return sign + base36 + +def fetchgenerator(cursor): + while True: + item = cursor.fetchone() + if item is None: + break + yield item + +def generator_chunker(generator, chunk_size): + chunk = [] + for item in generator: + chunk.append(item) + if len(chunk) == chunk_size: + yield chunk + chunk = [] + if len(chunk) != 0: + yield chunk + +def get_now(stamp=True): + now = datetime.datetime.now(datetime.timezone.utc) + if stamp: + return int(now.timestamp()) + return now + +def human(timestamp): + x = datetime.datetime.utcfromtimestamp(timestamp) + x = datetime.datetime.strftime(x, "%b %d %Y %H:%M:%S") + return x + +def int_none(x): + if x is None: + return None + 
return int(x) + +def nofailrequest(function): + ''' + Creates a function that will retry until it succeeds. + This function accepts 1 parameter, a function, and returns a modified + version of that function that will try-catch, sleep, and loop until it + finally returns. + ''' + def a(*args, **kwargs): + while True: + try: + result = function(*args, **kwargs) + return result + except KeyboardInterrupt: + raise + except Exception: + traceback.print_exc() + print('Retrying in 2...') + time.sleep(2) + return a diff --git a/timesearch/exceptions.py b/timesearch/exceptions.py new file mode 100644 index 0000000..b2142d8 --- /dev/null +++ b/timesearch/exceptions.py @@ -0,0 +1,3 @@ +class DBNotFound(FileNotFoundError): + def __init__(self, path): + self.path = path diff --git a/timesearch/getstyles.py b/timesearch/getstyles.py new file mode 100644 index 0000000..5d8dca9 --- /dev/null +++ b/timesearch/getstyles.py @@ -0,0 +1,31 @@ +import os +import requests + +from . import common +from . import tsdb + + +def getstyles(subreddit): + print('Getting styles for /r/%s' % subreddit) + subreddit = common.r.subreddit(subreddit) + + styles = subreddit.stylesheet() + database = tsdb.TSDB.for_subreddit(subreddit.display_name) + + os.makedirs(database.styles_dir.absolute_path, exist_ok=True) + + stylesheet_filepath = database.styles_dir.with_child('stylesheet.css') + print('Downloading %s' % stylesheet_filepath.relative_path) + with open(stylesheet_filepath.absolute_path, 'w', encoding='utf-8') as stylesheet: + stylesheet.write(styles.stylesheet) + + for image in styles.images: + image_basename = image['name'] + '.' + image['url'].split('.')[-1] + image_filepath = database.styles_dir.with_child(image_basename) + print('Downloading %s' % image_filepath.relative_path) + with open(image_filepath.absolute_path, 'wb') as image_file: + response = requests.get(image['url']) + image_file.write(response.content) + +def getstyles_argparse(args): + return getstyles(args.subreddit) diff --git a/timesearch/getwiki.py b/timesearch/getwiki.py new file mode 100644 index 0000000..ee0d2ca --- /dev/null +++ b/timesearch/getwiki.py @@ -0,0 +1,23 @@ +import os + +from . import common +from . import tsdb + + +def getwiki(subreddit): + print('Getting wiki pages for /r/%s' % subreddit) + subreddit = common.r.subreddit(subreddit) + database = tsdb.TSDB.for_subreddit(subreddit) + + for wikipage in subreddit.wiki: + if wikipage.name == 'config/stylesheet': + continue + + wikipage_path = database.wiki_dir.join(wikipage.name).replace_extension('md') + os.makedirs(wikipage_path.parent.absolute_path, exist_ok=True) + with open(wikipage_path.absolute_path, 'w', encoding='utf-8') as handle: + handle.write(wikipage.content_md) + print('Wrote', wikipage_path.relative_path) + +def getwiki_argparse(args): + return getwiki(args.subreddit) diff --git a/timesearch/livestream.py b/timesearch/livestream.py new file mode 100644 index 0000000..acb5928 --- /dev/null +++ b/timesearch/livestream.py @@ -0,0 +1,175 @@ +import copy +import time +import traceback + +from . import common +from . import tsdb + + +def livestream( + subreddit=None, + username=None, + verbose=False, + as_a_generator=False, + do_submissions=True, + do_comments=True, + limit=100, + only_once=False, + sleepy=30, + ): + ''' + Continuously get posts from this source + and insert them into the database + + as_a_generator: + return a generator where every iteration does a single livestream loop. 
+ This is good if you want to manage multiple livestreams yourself by + calling `next` on each of them, instead of getting stuck in here. + ''' + if bool(subreddit) == bool(username): + raise Exception('Require either username / subreddit parameter, but not both') + if bool(do_submissions) is bool(do_comments) is False: + raise Exception('Require do_submissions and/or do_comments parameter') + common.bot.login(common.r) + + if subreddit: + print('Getting subreddit %s' % subreddit) + database = tsdb.TSDB.for_subreddit(subreddit) + subreddit = common.r.subreddit(subreddit) + submissions = subreddit.new if do_submissions else None + comments = subreddit.comments if do_comments else None + else: + print('Getting redditor %s' % username) + database = tsdb.TSDB.for_user(username) + user = common.r.redditor(username) + submissions = user.submissions.new if do_submissions else None + comments = user.comments.new if do_comments else None + + generator = _livestream_as_a_generator( + database, + submission_function=submissions, + comment_function=comments, + limit=limit, + params={'show': 'all'}, + verbose=verbose, + ) + if as_a_generator: + return generator + + while True: + try: + step = next(generator) + newtext = '%ds, %dc' % (step['new_submissions'], step['new_comments']) + totalnew = step['new_submissions'] + step['new_comments'] + status = '{now} +{new}'.format(now=common.human(common.get_now()), new=newtext) + print(status, end='', flush=True) + if totalnew == 0 and verbose is False: + # Since there were no news, allow the next line to overwrite status + print('\r', end='') + else: + print() + + if verbose: + print('Loop finished.') + if only_once: + break + time.sleep(sleepy) + + except KeyboardInterrupt: + print() + return + + except Exception: + traceback.print_exc() + print('Retrying in 5...') + time.sleep(5) + +hangman = lambda: livestream( + username='gallowboob', + do_submissions=True, + do_comments=True, + sleepy=60, +) + +def _livestream_as_a_generator( + database, + submission_function, + comment_function, + limit, + params, + verbose, + ): + while True: + #common.r.handler.clear_cache() + try: + items = _livestream_helper( + submission_function=submission_function, + comment_function=comment_function, + limit=limit, + params=params, + verbose=verbose, + ) + newitems = database.insert(items) + yield newitems + except Exception: + traceback.print_exc() + print('Retrying in 5...') + time.sleep(5) + + +def _livestream_helper( + submission_function=None, + comment_function=None, + verbose=False, + *args, + **kwargs, + ): + ''' + Given a submission-retrieving function and/or a comment-retrieving function, + collect submissions and comments in a list together and return that. + + args and kwargs go into the collecting functions. + ''' + if bool(submission_function) is bool(comment_function) is False: + raise Exception('Require submissions and/or comments parameter') + results = [] + + if submission_function: + if verbose: + print('Getting submissions', args, kwargs) + this_kwargs = copy.deepcopy(kwargs) + submission_batch = submission_function(*args, **this_kwargs) + results.extend(submission_batch) + if comment_function: + if verbose: + print('Getting comments', args, kwargs) + this_kwargs = copy.deepcopy(kwargs) + comment_batch = comment_function(*args, **this_kwargs) + results.extend(comment_batch) + if verbose: + print('Collected. 
Saving...') + return results + +def livestream_argparse(args): + if args.submissions is args.comments is False: + args.submissions = True + args.comments = True + if args.limit is None: + limit = 100 + else: + limit = int(args.limit) + + if args.submissions is False and args.comments is False: + args.submissions = True + args.comments = True + + return livestream( + subreddit=args.subreddit, + username=args.username, + do_comments=args.comments, + do_submissions=args.submissions, + limit=limit, + verbose=args.verbose, + only_once=args.once, + sleepy=common.int_none(args.sleepy), + ) diff --git a/timesearch/mergedb.py b/timesearch/mergedb.py new file mode 100644 index 0000000..77b0f3a --- /dev/null +++ b/timesearch/mergedb.py @@ -0,0 +1,35 @@ +import os +import requests + +from . import common +from . import tsdb + + +MIGRATE_QUERY = ''' +INSERT INTO {tablename} +SELECT othertable.* FROM other.{tablename} othertable +LEFT JOIN {tablename} mytable ON mytable.idint == othertable.idint +WHERE mytable.idint IS NULL; +''' + +def _migrate_helper(db, tablename): + oldcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0] + + query = MIGRATE_QUERY.format(tablename=tablename) + print(query) + db.cur.execute(query) + db.sql.commit() + + newcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0] + print('Gained %d items.' % (newcount - oldcount)) + +def mergedb(from_db_path, to_db_path): + to_db = tsdb.TSDB(to_db_path) + from_db = tsdb.TSDB(from_db_path) + + to_db.cur.execute('ATTACH DATABASE "%s" AS other' % from_db_path) + _migrate_helper(to_db, 'submissions') + _migrate_helper(to_db, 'comments') + +def mergedb_argparse(args): + return mergedb(args.from_db_path, args.to_db_path) diff --git a/timesearch/offline_reading.py b/timesearch/offline_reading.py new file mode 100644 index 0000000..968074b --- /dev/null +++ b/timesearch/offline_reading.py @@ -0,0 +1,340 @@ +import os +import markdown + +from . import common +from . import tsdb + + +class DBEntry: + def __init__(self, fetch): + if fetch[1].startswith('t3_'): + columns = tsdb.SQL_SUBMISSION_COLUMNS + self.object_type = 'submission' + else: + columns = tsdb.SQL_COMMENT_COLUMNS + self.object_type = 'comment' + + self.id = None + self.idstr = None + for (index, attribute) in enumerate(columns): + setattr(self, attribute, fetch[index]) + + def __repr__(self): + return 'DBEntry(\'%s\')' % self.id + + +class TreeNode: + def __init__(self, identifier, data, parent=None): + assert isinstance(identifier, str) + assert '\\' not in identifier + self.identifier = identifier + self.data = data + self.parent = parent + self.children = {} + + def __getitem__(self, key): + return self.children[key] + + def __repr__(self): + return 'TreeNode %s' % self.abspath() + + def abspath(self): + node = self + nodes = [node] + while node.parent is not None: + node = node.parent + nodes.append(node) + nodes.reverse() + nodes = [node.identifier for node in nodes] + return '\\'.join(nodes) + + def add_child(self, other_node, overwrite_parent=False): + self.check_child_availability(other_node.identifier) + if other_node.parent is not None and not overwrite_parent: + raise ValueError('That node already has a parent. 
Try `overwrite_parent=True`') + + other_node.parent = self + self.children[other_node.identifier] = other_node + return other_node + + def check_child_availability(self, identifier): + if ':' in identifier: + raise Exception('Only roots may have a colon') + if identifier in self.children: + raise Exception('Node %s already has child %s' % (self.identifier, identifier)) + + def detach(self): + del self.parent.children[self.identifier] + self.parent = None + + def listnodes(self, customsort=None): + items = list(self.children.items()) + if customsort is None: + items.sort(key=lambda x: x[0].lower()) + else: + items.sort(key=customsort) + return [item[1] for item in items] + + def merge_other(self, othertree, otherroot=None): + newroot = None + if ':' in othertree.identifier: + if otherroot is None: + raise Exception('Must specify a new name for the other tree\'s root') + else: + newroot = otherroot + else: + newroot = othertree.identifier + othertree.identifier = newroot + othertree.parent = self + self.check_child_availability(newroot) + self.children[newroot] = othertree + + def printtree(self, customsort=None): + for node in self.walk(customsort): + print(node.abspath()) + + def walk(self, customsort=None): + yield self + for child in self.listnodes(customsort=customsort): + #print(child) + #print(child.listnodes()) + yield from child.walk(customsort=customsort) + +def html_format_comment(comment): + text = ''' +
+

+ {usernamelink} + | {score} points + | {human} +

+ +

{body}

+ +

+ {permalink} +

+ {children} +
+ '''.format( + id=comment.idstr, + body=sanitize_braces(render_markdown(comment.body)), + usernamelink=html_helper_userlink(comment), + score=comment.score, + human=common.human(comment.created), + permalink=html_helper_permalink(comment), + children='{children}', + ) + return text + +def html_format_submission(submission): + text = ''' +
+ +

+ {usernamelink} + | {score} points + | {human} +

+ + {title} +

{url_or_text}

+ +

+ {permalink} +

+
+ {children} + '''.format( + id=submission.idstr, + title=sanitize_braces(submission.title), + usernamelink=html_helper_userlink(submission), + score=submission.score, + human=common.human(submission.created), + permalink=html_helper_permalink(submission), + url_or_text=html_helper_urlortext(submission), + children='{children}', + ) + return text + +def html_from_database(subreddit=None, username=None, specific_submission=None): + ''' + Given a timesearch database filename, produce .html files for each + of the submissions it contains (or one particular submission fullname) + ''' + if markdown is None: + raise ImportError('Page cannot be rendered without the markdown module') + + if (subreddit is None) == (username is None): + raise Exception('Enter subreddit or username but not both') + + if subreddit: + database = tsdb.TSDB.for_subreddit(subreddit, do_create=False) + else: + database = tsdb.TSDB.for_user(username, do_create=False) + + submission_trees = trees_from_database(database, specific_submission) + for submission_tree in submission_trees: + page = html_from_tree(submission_tree, sort=lambda x: x.data.score * -1) + os.makedirs(database.offline_reading_dir.absolute_path, exist_ok=True) + html_basename = '%s.html' % submission_tree.identifier + html_filepath = database.offline_reading_dir.with_child(html_basename) + html_handle = open(html_filepath.absolute_path, 'w', encoding='utf-8') + html_handle.write('') + html_handle.write(page) + html_handle.write('') + html_handle.close() + print('Wrote', html_filepath.relative_path) + +def html_from_tree(tree, sort=None): + ''' + Given a tree *whose root is the submission*, return + HTML-formatted text representing each submission's comment page. + ''' + if tree.data.object_type == 'submission': + page = html_format_submission(tree.data) + elif tree.data.object_type == 'comment': + page = html_format_comment(tree.data) + children = tree.listnodes() + if sort is not None: + children.sort(key=sort) + children = [html_from_tree(child, sort) for child in children] + if len(children) == 0: + children = '' + else: + children = '\n\n'.join(children) + try: + page = page.format(children=children) + except IndexError: + print(page) + raise + return page + +def html_helper_permalink(item): + link = 'https://www.reddit.com/r/%s/comments/' % item.subreddit + if item.object_type == 'submission': + link += item.idstr[3:] + elif item.object_type == 'comment': + link += '%s/_/%s' % (item.submission[3:], item.idstr[3:]) + link = 'permalink' % link + return link + +def html_helper_urlortext(submission): + if submission.url: + text = '{url}'.format(url=submission.url) + elif submission.selftext: + text = render_markdown(submission.selftext) + else: + text = '' + text = sanitize_braces(text) + return text + +def html_helper_userlink(item): + name = item.author + if name.lower() == '[deleted]': + return '[deleted]' + link = 'https://www.reddit.com/u/{name}' + link = '{name}' % link + link = link.format(name=name) + return link + +def render_markdown(text): + text = markdown.markdown(text, output_format='html5') + return text + +def sanitize_braces(text): + text = text.replace('{', '{{') + text = text.replace('}', '}}') + return text + +def trees_from_database(database, specific_submission=None): + ''' + Given a timesearch database filename, take all of the submission + ids, take all of the comments for each submission id, and run them + through `tree_from_submission`. + + Yield each submission's tree as it is generated. 
+ ''' + cur1 = database.sql.cursor() + cur2 = database.sql.cursor() + + if specific_submission is None: + cur1.execute('SELECT idstr FROM submissions ORDER BY created ASC') + submission_ids = common.fetchgenerator(cur1) + else: + specific_submission = 't3_' + specific_submission.split('_')[-1] + # Insert as a tuple to behave like the sql fetch results + submission_ids = [(specific_submission, None)] + + found_some_posts = False + for submission_id in submission_ids: + # Extract sql fetch + submission_id = submission_id[0] + found_some_posts = True + cur2.execute('SELECT * FROM submissions WHERE idstr == ?', [submission_id]) + submission = cur2.fetchone() + cur2.execute('SELECT * FROM comments WHERE submission == ?', [submission_id]) + fetched_comments = cur2.fetchall() + submission_tree = tree_from_submission(submission, fetched_comments) + yield submission_tree + + if not found_some_posts: + raise Exception('Found no submissions!') + +def tree_from_submission(submission, commentpool): + ''' + Given the sqlite data for a submission and all of its comments, + return a tree with the submission id as the root + ''' + submission = DBEntry(submission) + commentpool = [DBEntry(c) for c in commentpool] + commentpool.sort(key=lambda x: x.created) + + print('Building tree for %s (%d comments)' % (submission.idstr, len(commentpool))) + # Thanks Martin Schmidt for the algorithm + # http://stackoverflow.com/a/29942118/5430534 + tree = TreeNode(identifier=submission.idstr, data=submission) + node_map = {} + + for comment in commentpool: + # Ensure this comment is in a node of its own + this_node = node_map.get(comment.idstr, None) + if this_node: + # This ID was detected as a parent of a previous iteration + # Now we're actually filling it in. + this_node.data = comment + else: + this_node = TreeNode(comment.idstr, comment) + node_map[comment.idstr] = this_node + + # Attach this node to the parent. + if comment.parent.startswith('t3_'): + tree.add_child(this_node) + else: + parent_node = node_map.get(comment.parent, None) + if not parent_node: + parent_node = TreeNode(comment.parent, data=None) + node_map[comment.parent] = parent_node + parent_node.add_child(this_node) + this_node.parent = parent_node + return tree + +def offline_reading_argparse(args): + return html_from_database( + subreddit=args.subreddit, + username=args.username, + specific_submission=args.specific_submission, + ) diff --git a/timesearch/redmash.py b/timesearch/redmash.py new file mode 100644 index 0000000..01cd482 --- /dev/null +++ b/timesearch/redmash.py @@ -0,0 +1,177 @@ +import datetime +import os + +from . import common +from . import tsdb + + +LINE_FORMAT_TXT = ''' +{timestamp}: [{title}]({shortlink}) - /u/{author} (+{score}) +'''.replace('\n', '') + +LINE_FORMAT_HTML = ''' +{timestamp}: [{flairtext}] {title} - {author} (+{score})
+'''.replace('\n', '') + +TIMESTAMP_FORMAT = '%Y %b %d' +#The time format. +# "%Y %b %d" = "2016 August 10" +# See http://strftime.org/ + +HTML_HEADER = ''' + + + + + + + +''' + +HTML_FOOTER = ''' + + +''' + + +def redmash( + subreddit=None, + username=None, + do_all=False, + do_date=False, + do_title=False, + do_score=False, + do_author=False, + do_subreddit=False, + do_flair=False, + html=False, + score_threshold=0, + ): + if (subreddit is None) == (username is None): + raise Exception('Enter subreddit or username but not both') + + if subreddit: + database = tsdb.TSDB.for_subreddit(subreddit, do_create=False) + else: + database = tsdb.TSDB.for_user(username, do_create=False) + + kwargs = {'html': html, 'score_threshold': score_threshold} + wrote = None + + if do_all or do_date: + print('Writing time file') + wrote = redmash_worker(database, suffix='_date', orderby='created ASC', **kwargs) + + if do_all or do_title: + print('Writing title file') + wrote = redmash_worker(database, suffix='_title', orderby='title ASC', **kwargs) + + if do_all or do_score: + print('Writing score file') + wrote = redmash_worker(database, suffix='_score', orderby='score DESC', **kwargs) + + if not username and (do_all or do_author): + print('Writing author file') + wrote = redmash_worker(database, suffix='_author', orderby='author ASC', **kwargs) + + if username and (do_all or do_subreddit): + print('Writing subreddit file') + wrote = redmash_worker(database, suffix='_subreddit', orderby='subreddit ASC', **kwargs) + + if do_all or do_flair: + print('Writing flair file') + # Items with flair come before items without. Each group is sorted by time separately. + orderby = 'flair_text IS NULL ASC, created ASC' + wrote = redmash_worker(database, suffix='_flair', orderby=orderby, **kwargs) + + if not wrote: + raise Exception('No sorts selected! 
Read the docstring') + print('Done.') + +def redmash_worker( + database, + suffix, + orderby, + score_threshold=0, + html=False, + ): + cur = database.sql.cursor() + statement = 'SELECT * FROM submissions WHERE score >= {threshold} ORDER BY {order}' + statement = statement.format(threshold=score_threshold, order=orderby) + cur.execute(statement) + + os.makedirs(database.redmash_dir.absolute_path, exist_ok=True) + + extension = '.html' if html else '.txt' + mash_basename = database.filepath.replace_extension('').basename + mash_basename += suffix + extension + mash_filepath = database.redmash_dir.with_child(mash_basename) + + mash_handle = open(mash_filepath.absolute_path, 'w', encoding='UTF-8') + if html: + mash_handle.write(HTML_HEADER) + line_format = LINE_FORMAT_HTML + else: + line_format = LINE_FORMAT_TXT + + do_timestamp = '{timestamp}' in line_format + + for item in common.fetchgenerator(cur): + if do_timestamp: + timestamp = int(item[tsdb.SQL_SUBMISSION['created']]) + timestamp = datetime.datetime.utcfromtimestamp(timestamp) + timestamp = timestamp.strftime(TIMESTAMP_FORMAT) + else: + timestamp = '' + + short_link = 'https://redd.it/%s' % item[tsdb.SQL_SUBMISSION['idstr']][3:] + author = item[tsdb.SQL_SUBMISSION['author']] + if author.lower() == '[deleted]': + author_link = '#' + else: + author_link = 'https://reddit.com/u/%s' % author + line = line_format.format( + author=author, + authorlink=author_link, + flaircss=item[tsdb.SQL_SUBMISSION['flair_css_class']] or '', + flairtext=item[tsdb.SQL_SUBMISSION['flair_text']] or '', + id=item[tsdb.SQL_SUBMISSION['idstr']], + numcomments=item[tsdb.SQL_SUBMISSION['num_comments']], + score=item[tsdb.SQL_SUBMISSION['score']], + shortlink=short_link, + subreddit=item[tsdb.SQL_SUBMISSION['subreddit']], + timestamp=timestamp, + title=item[tsdb.SQL_SUBMISSION['title']].replace('\n', ' '), + url=item[tsdb.SQL_SUBMISSION['url']] or short_link, + ) + line += '\n' + mash_handle.write(line) + + if html: + mash_handle.write(HTML_FOOTER) + mash_handle.close() + print('Wrote', mash_filepath.relative_path) + return mash_filepath + +def redmash_argparse(args): + if args.subreddit is args.username is None: + raise ValueError('-r subreddit OR -u username must be provided') + + return redmash( + subreddit=args.subreddit, + username=args.username, + do_all=args.do_all, + do_date=args.do_date, + do_title=args.do_title, + do_score=args.do_score, + do_author=args.do_author, + do_subreddit=args.do_subreddit, + do_flair=args.do_flair, + html=args.html, + score_threshold=common.int_none(args.score_threshold), + ) diff --git a/timesearch/timesearch.py b/timesearch/timesearch.py new file mode 100644 index 0000000..5e61e54 --- /dev/null +++ b/timesearch/timesearch.py @@ -0,0 +1,147 @@ +import time +import traceback + +from . import common +from . import tsdb + + +# The maximum amount by which it can multiply the interval +# when not enough posts are found. +MAXIMUM_EXPANSION_MULTIPLIER = 2 + + +def timesearch( + subreddit=None, + username=None, + lower=None, + upper=None, + interval=86400, + ): + ''' + Collect submissions across time. + Please see the global DOCSTRING variable. + ''' + if (subreddit is None) == (username is None): + raise Exception('Enter subreddit or username but not both') + + common.bot.login(common.r) + + if subreddit: + database = tsdb.TSDB.for_subreddit(subreddit) + else: + # When searching, we'll take the user's submissions from anywhere. 
+ subreddit = 'all' + database = tsdb.TSDB.for_user(username) + cur = database.sql.cursor() + + if lower == 'update': + # Start from the latest submission + cur.execute('SELECT * FROM submissions ORDER BY idint DESC LIMIT 1') + f = cur.fetchone() + if f: + lower = f[tsdb.SQL_SUBMISSION['created']] + print(f[tsdb.SQL_SUBMISSION['idstr']], common.human(lower), lower) + else: + lower = None + + if not isinstance(subreddit, common.praw.models.Subreddit): + subreddit = common.r.subreddit(subreddit) + + if subreddit != 'all': + if isinstance(subreddit, common.praw.models.Subreddit): + creation = subreddit.created_utc + else: + subreddits = subreddit.split('+') + subreddits = [common.r.subreddit(sr) for sr in subreddits] + creation = min([sr.created_utc for sr in subreddits]) + else: + if not isinstance(username, common.praw.models.Redditor): + user = common.r.redditor(username) + creation = user.created_utc + + if lower is None or lower < creation: + lower = creation + + maxupper = upper + if maxupper is None: + maxupper = common.get_now() + 86400 + + form = '{upper} - {lower} +{gain}' + submissions = subreddit.submissions(start=lower, end=maxupper) + submissions = common.generator_chunker(submissions, 100) + for chunk in submissions: + chunk.sort(key=lambda x: x.created_utc, reverse=True) + new_count = database.insert(chunk)['new_submissions'] + message = form.format( + upper=common.human(chunk[0].created_utc), + lower=common.human(chunk[-1].created_utc), + gain=new_count, + ) + print(message) + + #upper = lower + interval + #toomany_inarow = 0 + # while lower < maxupper: + # print('\nCurrent interval:', interval, 'seconds') + # print('Lower:', common.human(lower), lower) + # print('Upper:', common.human(upper), upper) + # if username: + # query = '(and author:"%s" (and timestamp:%d..%d))' % (username, lower, upper) + # else: + # query = 'timestamp:%d..%d' % (lower, upper) + + # try: + # searchresults = subreddit.search( + # query, + # sort='new', + # limit=100, + # syntax='cloudsearch' + # ) + # searchresults = list(searchresults) + # except Exception: + # traceback.print_exc() + # print('resuming in 5...') + # time.sleep(5) + # continue + + # searchresults.sort(key=lambda x: x.created_utc) + # print([i.id for i in searchresults]) + + # itemsfound = len(searchresults) + # print('Found', itemsfound, 'items.') + # if itemsfound < 50: + # print('Too few results, increasing interval', end='') + # diff = (1 - (itemsfound / 75)) + 1 + # diff = min(MAXIMUM_EXPANSION_MULTIPLIER, diff) + # interval = int(interval * diff) + # if itemsfound > 99: + # #Intentionally not elif + # print('Too many results, reducing interval', end='') + # interval = int(interval * (0.8 - (0.05 * toomany_inarow))) + # upper = lower + interval + # toomany_inarow += 1 + # else: + # lower = upper + # upper = lower + interval + # toomany_inarow = max(0, toomany_inarow-1) + # print(database.insert(searchresults)) + # print() + + cur.execute('SELECT COUNT(idint) FROM submissions') + itemcount = cur.fetchone()[0] + + print('Ended with %d items in %s' % (itemcount, database.filepath.basename)) + +def timesearch_argparse(args): + if args.lower == 'update': + lower = 'update' + else: + lower = common.int_none(args.lower) + + return timesearch( + subreddit=args.subreddit, + username=args.username, + lower=lower, + upper=common.int_none(args.upper), + interval=common.int_none(args.interval), + ) diff --git a/timesearch/tsdb.py b/timesearch/tsdb.py new file mode 100644 index 0000000..51d03c2 --- /dev/null +++ b/timesearch/tsdb.py @@ -0,0 
+1,335 @@ +import os +import sqlite3 +import types + +from . import common +from . import exceptions + +from voussoirkit import pathclass + + +# For backwards compatibility reasons, this list of format strings will help +# timesearch find databases that are using the old filename style. +# The final element will be used if none of the previous ones were found. +DB_FORMATS_SUBREDDIT = [ + '.\\{name}.db', + '.\\subreddits\\{name}\\{name}.db', + '.\\{name}\\{name}.db', + '.\\databases\\{name}.db', + '.\\subreddits\\{name}\\{name}.db', +] +DB_FORMATS_USER = [ + '.\\@{name}.db', + '.\\users\\@{name}\\@{name}.db', + '.\\@{name}\\@{name}.db', + '.\\databases\\@{name}.db', + '.\\users\\@{name}\\@{name}.db', +] + +DB_INIT = ''' +CREATE TABLE IF NOT EXISTS submissions( + idint INT, + idstr TEXT, + created INT, + self INT, + nsfw INT, + author TEXT, + title TEXT, + url TEXT, + selftext TEXT, + score INT, + subreddit TEXT, + distinguish INT, + textlen INT, + num_comments INT, + flair_text TEXT, + flair_css_class TEXT, + augmented_at INT, + augmented_count INT +); +CREATE INDEX IF NOT EXISTS submission_index ON submissions(idstr); +---------------------------------------------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS comments( + idint INT, + idstr TEXT, + created INT, + author TEXT, + parent TEXT, + submission TEXT, + body TEXT, + score INT, + subreddit TEXT, + distinguish TEXT, + textlen INT +); +CREATE INDEX IF NOT EXISTS comment_index ON comments(idstr); +'''.strip() + +SQL_SUBMISSION_COLUMNS = [ + 'idint', + 'idstr', + 'created', + 'self', + 'nsfw', + 'author', + 'title', + 'url', + 'selftext', + 'score', + 'subreddit', + 'distinguish', + 'textlen', + 'num_comments', + 'flair_text', + 'flair_css_class', + 'augmented_at', + 'augmented_count', +] + +SQL_COMMENT_COLUMNS = [ + 'idint', + 'idstr', + 'created', + 'author', + 'parent', + 'submission', + 'body', + 'score', + 'subreddit', + 'distinguish', + 'textlen', +] + +SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)} +SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)} + + +class TSDB: + def __init__(self, filepath, do_create=True): + self.filepath = pathclass.Path(filepath) + if not self.filepath.is_file: + if not do_create: + raise exceptions.DBNotFound(self.filepath) + print('New database', self.filepath.relative_path) + + os.makedirs(self.filepath.parent.absolute_path, exist_ok=True) + + self.breakdown_dir = self.filepath.parent.with_child('breakdown') + self.offline_reading_dir = self.filepath.parent.with_child('offline_reading') + self.redmash_dir = self.filepath.parent.with_child('redmash') + self.styles_dir = self.filepath.parent.with_child('styles') + self.wiki_dir = self.filepath.parent.with_child('wiki') + + self.sql = sqlite3.connect(self.filepath.absolute_path) + self.cur = self.sql.cursor() + statements = DB_INIT.split(';') + for statement in statements: + self.cur.execute(statement) + self.sql.commit() + + def __repr__(self): + return 'TSDB(%s)' % self.filepath + + @staticmethod + def _pick_filepath(formats, name): + ''' + Starting with the most specific and preferred filename format, check + if there is an existing database that matches the name we're looking + for, and return that path. If none of them exist, then use the most + preferred filepath. 
+ ''' + paths = [pathclass.Path(format.format(name=name)) for format in formats] + for path in paths: + if path.is_file: + return path + return paths[-1] + + @classmethod + def for_subreddit(cls, name, do_create=True): + if isinstance(name, common.praw.models.Subreddit): + name = name.display_name + elif not isinstance(name, str): + raise TypeError(name, 'should be str or Subreddit.') + + filepath = cls._pick_filepath(formats=DB_FORMATS_SUBREDDIT, name=name) + return cls(filepath=filepath, do_create=do_create) + + @classmethod + def for_user(cls, name, do_create=True): + if isinstance(name, common.praw.models.Redditor): + name = name.name + elif not isinstance(name, str): + raise TypeError(name, 'should be str or Redditor.') + + filepath = cls._pick_filepath(formats=DB_FORMATS_USER, name=name) + return cls(filepath=filepath, do_create=do_create) + + def insert(self, objects, commit=True): + if not isinstance(objects, (list, tuple, types.GeneratorType)): + objects = [objects] + + new_values = { + 'new_submissions': 0, + 'new_comments': 0, + } + methods = { + common.praw.models.Submission: (self.insert_submission, 'new_submissions'), + common.praw.models.Comment: (self.insert_comment, 'new_comments'), + } + for obj in objects: + (method, key) = methods.get(type(obj), (None, None)) + if method is None: + raise TypeError('Unsupported', type(obj), obj) + status = method(obj) + new_values[key] += status + + if commit: + self.sql.commit() + + return new_values + + def insert_submission(self, submission): + cur = self.sql.cursor() + cur.execute('SELECT * FROM submissions WHERE idstr == ?', [submission.fullname]) + existing_entry = cur.fetchone() + + if submission.author is None: + author = '[DELETED]' + else: + author = submission.author.name + + if not existing_entry: + if submission.is_self: + # Selfpost's URL leads back to itself, so just ignore it. + url = None + else: + url = submission.url + + postdata = { + 'idint': common.b36(submission.id), + 'idstr': submission.fullname, + 'created': submission.created_utc, + 'self': submission.is_self, + 'nsfw': submission.over_18, + 'author': author, + 'title': submission.title, + 'url': url, + 'selftext': submission.selftext, + 'score': submission.score, + 'subreddit': submission.subreddit.display_name, + 'distinguish': submission.distinguished, + 'textlen': len(submission.selftext), + 'num_comments': submission.num_comments, + 'flair_text': submission.link_flair_text, + 'flair_css_class': submission.link_flair_css_class, + 'augmented_at': None, + 'augmented_count': None, + } + (qmarks, bindings) = binding_filler(SQL_SUBMISSION_COLUMNS, postdata, require_all=True) + query = 'INSERT INTO submissions VALUES(%s)' % qmarks + cur.execute(query, bindings) + + else: + if submission.author is None: + # This post is deleted, therefore its text probably says [deleted] or [removed]. + # Discard that, and keep the data we already had here. + selftext = existing_entry[SQL_SUBMISSION['selftext']] + else: + selftext = submission.selftext + + query = ''' + UPDATE submissions SET + nsfw = coalesce(?, nsfw), + score = coalesce(?, score), + selftext = coalesce(?, selftext), + distinguish = coalesce(?, distinguish), + num_comments = coalesce(?, num_comments), + flair_text = coalesce(?, flair_text), + flair_css_class = coalesce(?, flair_css_class) + WHERE idstr == ? 
+ ''' + bindings = [ + submission.over_18, + submission.score, + selftext, + submission.distinguished, + submission.num_comments, + submission.link_flair_text, + submission.link_flair_css_class, + submission.fullname + ] + cur.execute(query, bindings) + + return existing_entry is None + + def insert_comment(self, comment): + cur = self.sql.cursor() + cur.execute('SELECT * FROM comments WHERE idstr == ?', [comment.fullname]) + existing_entry = cur.fetchone() + + if comment.author is None: + author = '[DELETED]' + else: + author = comment.author.name + + if not existing_entry: + postdata = { + 'idint': common.b36(comment.id), + 'idstr': comment.fullname, + 'created': comment.created_utc, + 'author': author, + 'parent': comment.parent_id, + 'submission': comment.link_id, + 'body': comment.body, + 'score': comment.score, + 'subreddit': comment.subreddit.display_name, + 'distinguish': comment.distinguished, + 'textlen': len(comment.body), + } + (qmarks, bindings) = binding_filler(SQL_COMMENT_COLUMNS, postdata, require_all=True) + query = 'INSERT INTO comments VALUES(%s)' % qmarks + cur.execute(query, bindings) + + else: + greasy = ['has been overwritten', 'pastebin.com/64GuVi2F'] + if comment.author is None or any(grease in comment.body for grease in greasy): + body = existing_entry[SQL_COMMENT['body']] + else: + body = comment.body + + query = ''' + UPDATE comments SET + score = coalesce(?, score), + body = coalesce(?, body), + distinguish = coalesce(?, distinguish) + WHERE idstr == ? + ''' + bindings = [ + comment.score, + body, + comment.distinguished, + comment.fullname + ] + cur.execute(query, bindings) + + return existing_entry is None + + +def binding_filler(column_names, values, require_all=True): + ''' + Manually aligning question marks and bindings is annoying. + Given the table's column names and a dictionary of {column: value}, + return the question marks and the list of bindings in the right order. + ''' + values = values.copy() + for column in column_names: + if column in values: + continue + if require_all: + raise ValueError('Missing column "%s"' % column) + else: + values.setdefault(column, None) + qmarks = '?' * len(column_names) + qmarks = ', '.join(qmarks) + bindings = [values[column] for column in column_names] + return (qmarks, bindings)
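A quick sketch of how the pieces in tsdb.py fit together: DB_INIT creates the tables, binding_filler pairs the SQL_SUBMISSION_COLUMNS names with a row dictionary so the INSERT's question marks and bindings stay aligned, and the SQL_SUBMISSION map turns fetched tuples back into named fields. The submission values below are made up for illustration, and the import assumes `timesearch.tsdb` loads cleanly (i.e. PRAW, voussoirkit, and your `bot.py` are set up as the README describes).

    import sqlite3

    from timesearch import tsdb

    # Build the schema in a throwaway in-memory database.
    sql = sqlite3.connect(':memory:')
    cur = sql.cursor()
    for statement in tsdb.DB_INIT.split(';'):
        if statement.strip():
            cur.execute(statement)

    # A hypothetical submission row. Every column must be present because
    # binding_filler is called with require_all=True.
    postdata = {column: None for column in tsdb.SQL_SUBMISSION_COLUMNS}
    postdata.update({
        'idint': 12345,
        'idstr': 't3_example',
        'created': 1510000000,
        'author': 'example_user',
        'title': 'Example post',
        'subreddit': 'learnpython',
        'score': 1,
    })

    (qmarks, bindings) = tsdb.binding_filler(tsdb.SQL_SUBMISSION_COLUMNS, postdata, require_all=True)
    cur.execute('INSERT INTO submissions VALUES(%s)' % qmarks, bindings)

    # Read the row back and index the tuple by column name, the same way
    # redmash_worker does with item[tsdb.SQL_SUBMISSION['title']].
    cur.execute('SELECT * FROM submissions WHERE idstr == ?', ['t3_example'])
    row = cur.fetchone()
    print(row[tsdb.SQL_SUBMISSION['title']], row[tsdb.SQL_SUBMISSION['score']])

Because bindings is built in SQL_SUBMISSION_COLUMNS order, the INSERT stays correct even if columns are later reordered in one place, as long as the column list and the table definition are kept in sync.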