Initial migratory commit from voussoir/reddit.

master
Ethan Dalool 2017-11-13 19:13:19 -08:00
commit 708c774e52
18 changed files with 2471 additions and 0 deletions

30
.gitattributes vendored Normal file

@@ -0,0 +1,30 @@
# Auto detect text files and perform LF normalization
* text=auto
# Custom for Visual Studio
*.cs diff=csharp
*.sln merge=union
*.csproj merge=union
*.vbproj merge=union
*.fsproj merge=union
*.dbproj merge=union
*.psd binary
*.zip binary
*.db binary
*.png binary
*.jpg binary
*.ico binary
*.exe binary
# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain

231
.gitignore vendored Normal file

@@ -0,0 +1,231 @@
databases/*
@hangman.md
hangman.py
merge_database.py
migrate_20160605.py
timesearch_backup.py
*.ignore
*.db-journal
*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# CDT-specific
.cproject
# PDT-specific
.buildpath
#################
## Visual Studio
#################
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.sln.docstates
# Build results
[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile
# Visual Studio profiler
*.psess
*.vsp
*.vspx
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
*.ncrunch*
.*crunch*.local.xml
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.Publish.xml
*.pubxml
# NuGet Packages Directory
## TODO: If you have NuGet Package Restore enabled, uncomment the next line
#packages/
# Windows Azure Build Output
csx
*.build.csdef
# Windows Store app package directory
AppPackages/
# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
App_Data/*.mdf
App_Data/*.ldf
#############
## Windows detritus
#############
# Windows image file caches
Thumbs.db
ehthumbs.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Mac crap
.DS_Store
#############
## Python
#############
*.py[co]
# Packages
*.egg
*.egg-info
dist/
build/
eggs/
parts/
var/
sdist/
develop-eggs/
.installed.cfg
# Installer logs
pip-log.txt
# Unit test / coverage reports
.coverage
.tox
#Translations
*.mo
#Mr Developer
.mr.developer.cfg
*~
*.egg
*.pyc
.coverage
*.egg-info/
_build/
build/
dist/
.DS_Store

114
README.md Normal file

@@ -0,0 +1,114 @@
timesearch
==========
I don't have a test suite. You're my test suite! Messages go to [/u/GoldenSights](https://reddit.com/u/GoldenSights).
Timesearch is a collection of utilities for archiving subreddits.
### Make sure you have:
- Installed [Python](https://www.python.org/download). I use Python 3.6.
- Installed PRAW >= 4, as well as the other modules in `requirements.txt`. Try `pip install -r requirements.txt` to get them all.
- Created an OAuth app at https://reddit.com/prefs/apps. Make it `script` type, and set the redirect URI to `http://localhost:8080`. The title and description can be anything you want, and the about URL is not required.
- Used [this PRAW script](https://praw.readthedocs.io/en/latest/tutorials/refresh_token.html) to generate a refresh token. Just save it as a .py file somewhere and run it through your terminal / command line. For simplicity's sake, I just choose `all` for the scopes.
- Downloaded a copy of [this file](https://github.com/voussoir/reddit/blob/master/bot4.py) and saved it as `bot.py`. Fill out the variables using your OAuth information, and read the instructions to see where to put it. The Useragent is a description of your API usage. Typically "/u/username's praw client" is sufficient.
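For orientation, here is a minimal sketch of the shape `bot.py` tends to take, inferred from how `timesearch/common.py` calls it (`bot.anonymous()` and `bot.login(r)`). The variable names are assumptions, not the contents of the real bot4.py, which also provides a `login()` helper that applies your refresh token.

```python
# bot.py -- hypothetical sketch only; the real bot4.py from voussoir/reddit may differ.
import praw

USERAGENT = "/u/yourusername's praw client"  # a description of your API usage
APP_ID = 'your_oauth_app_id'                 # from https://reddit.com/prefs/apps
APP_SECRET = 'your_oauth_app_secret'
APP_REFRESH = 'your_refresh_token'           # from the PRAW refresh token script

def anonymous():
    # A read-only Reddit instance, enough for fetching public data.
    return praw.Reddit(
        client_id=APP_ID,
        client_secret=APP_SECRET,
        user_agent=USERAGENT,
    )
```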
### This package consists of:
- **timesearch**: If you try to page through `/new` on a subreddit, you'll hit a limit at or before 1,000 posts. Timesearch uses the `timestamp` cloudsearch query parameter to step from the beginning of a subreddit to present time, to collect as many submissions as possible. Read more about timestamp searching [here](https://www.reddit.com/r/reddittips/comments/2ix73n/use_cloudsearch_to_search_for_posts_on_reddit/). A sketch of this kind of query appears after this list.
`> timesearch.py timesearch -r subredditname <flags>`
`> timesearch.py timesearch -u username <flags>`
- **commentaugment**: Although we can search for submissions, we cannot search for comments. After performing a timesearch, you can use commentaugment to download the comment tree for each submission.
Note: commentaugment only gets the comments attached to the submissions that you found in your timesearch scan. If you're trying to commentaugment on a user, you're going to get comments that were made on their submissions, **not** comments they made on other people's submissions. Therefore, comprehensively collecting a user's activity is not possible. You will have to use someone else's dataset like that of [/u/Stuck_in_the_Matrix](https://reddit.com/u/Stuck_in_the_Matrix) at [pushshift.io](https://pushshift.io).
`> timesearch.py commentaugment -r subredditname <flags>`
`> timesearch.py commentaugment -u username <flags>`
- **livestream**: timesearch+commentaugment is great for starting your database and getting historical posts, but it's not the best for staying up-to-date. Instead, livestream monitors `/new` and `/comments` to continuously ingest data.
`> timesearch.py livestream -r subredditname <flags>`
`> timesearch.py livestream -u username <flags>`
- **getstyles**: Downloads the stylesheet and CSS images.
`> timesearch.py getstyles -r subredditname`
- **getwiki**: Downloads the wiki pages, sidebar, etc. from /wiki/pages.
`> timesearch.py getwiki -r subredditname`
- **offline_reading**: Renders comment threads into HTML via markdown.
Note: I'm currently using the [markdown library from pypi](https://pypi.python.org/pypi/Markdown), and it doesn't do reddit's custom markdown like `/r/` or `/u/`, obviously. So far I don't think anybody really uses o_r so I haven't invested much time into improving it.
`> timesearch.py offline_reading -r subredditname <flags>`
`> timesearch.py offline_reading -u username <flags>`
- **redmash**: Generates plaintext or HTML lists of submissions, sorted by a property of your choosing. You can order by date, author, flair, etc.
`> timesearch.py redmash -r subredditname <flags>`
`> timesearch.py redmash -u username <flags>`
- **breakdown**: Produces a JSON file indicating which users make the most posts in a subreddit, or which subreddits a user posts in.
`> timesearch.py breakdown -r subredditname <flags>`
`> timesearch.py breakdown -u username <flags>`
- **mergedb**: Copy all new data from one timesearch database into another. Useful for syncing or merging two scans of the same subreddit.
`> timesearch.py mergedb --from filepath/database1.db --to filepath/database2.db`
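To make the timesearch step concrete, here is a rough sketch of the timestamp-windowed cloudsearch query it relies on. This is an illustration only, not the code from `timesearch/timesearch.py`; the credentials, subreddit name, and timestamps are placeholders.

```python
# Hypothetical sketch of one scanning window (not timesearch's actual implementation).
import praw

reddit = praw.Reddit(
    client_id='your_oauth_app_id',
    client_secret='your_oauth_app_secret',
    user_agent="/u/yourusername's praw client",
)

lower = 1467460221            # unix timestamp where this window starts
upper = lower + 86400         # one interval (a day) later
query = 'timestamp:%d..%d' % (lower, upper)

for submission in reddit.subreddit('botwatch').search(query, sort='new', syntax='cloudsearch', limit=100):
    print(submission.fullname, submission.created_utc, submission.title)
```

Timesearch slides this window forward through history, stretching or shrinking it based on how many submissions each window returns.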
### To use it
You will need both the `timesearch` package (folder) and the external `timesearch.py` file. You can click the green "Clone or Download" button in the upper right. When you run the .py file, it sends your commandline arguments into the package. You can view a summarized version of all the help text with just `timesearch.py`, or you can view a specific docstring with `timesearch.py livestream`, etc.
I recommend [sqlitebrowser](https://github.com/sqlitebrowser/sqlitebrowser/releases) if you want to inspect the database yourself.
### Changelog
- 2017 11 13
- Gave timesearch its own Github repository so that (1) it will be easier for people to download it and (2) it has a cleaner, more independent URL. [voussoir/timesearch](https://github.com/voussoir/timesearch)
- 2017 11 05
- Added a try-except inside livestream helper to prevent generator from terminating.
- 2017 11 04
- For timesearch, I switched from using my custom cloudsearch iterator to the one that comes with PRAW4+.
- 2017 10 12
- Added the `mergedb` utility for combining databases.
- 2017 06 02
- You can use `commentaugment -s abcdef` to get a particular thread even if you haven't scraped anything else from that subreddit. Previously `-s` only worked if the database already existed and you specified it via `-r`. Now it is inferred from the submission itself.
- 2017 04 28
- Complete restructure into package, started using PRAW4.
- 2016 08 10
- Started merging redmash and wrote its argparser
- 2016 07 03
- Improved docstring clarity.
- 2016 07 02
- Added `livestream` argparse
- 2016 06 07
- Offline_reading has been merged with the main timesearch file
- `get_all_posts` renamed to `timesearch`
- Timesearch parameter `usermode` renamed to `username`; `maxupper` renamed to `upper`.
- Everything now accessible via commandline arguments. Read the docstring at the top of the file.
- 2016 06 05
- NEW DATABASE SCHEME. Submissions and comments now live in different tables like they should have all along. Submission table has two new columns for a little bit of commentaugment metadata. This allows commentaugment to only scan threads that are new.
- You can use the `migrate_20160605.py` script to convert old databases into new ones.
- 2015 11 11
- created `offline_reading.py` which converts a timesearch database into a comment tree that can be rendered into HTML
- 2015 09 07
- fixed bug which allowed `livestream` to crash because `bot.refresh()` was outside of the try-catch.
- 2015 08 19
- fixed bug in which updatescores stopped iterating early if you had more than 100 comments in a row in the db
- commentaugment has been completely merged into the timesearch.py file. you can use commentaugment_prompt() to input the parameters, or use the commentaugment() function directly.
____
I want to live in a future where everyone uses UTC and agrees on daylight savings.
<p align="center">
<img src="https://github.com/voussoir/reddit/blob/master/.GitImages/timesearch_logo_256.png?raw=true" alt="Timesearch"/>
</p>

3
requirements.txt Normal file

@@ -0,0 +1,3 @@
markdown
praw
voussoirkit

5
timesearch.py Normal file

@@ -0,0 +1,5 @@
import sys
import timesearch
status_code = timesearch.main(sys.argv[1:])
raise SystemExit(status_code)

436
timesearch/__init__.py Normal file

@@ -0,0 +1,436 @@
import argparse
import sys
from . import exceptions
# NOTE: Originally I wanted the docstring for each module to be within their
# file. However, this means that composing the global helptext would require
# importing those modules, which will subsequently import PRAW and a whole lot
# of other things. This made TS very slow to load which is okay when you're
# actually using it but really terrible when you're just viewing the help text.
DOCSTRING = '''
Timesearch
The subreddit archiver
The basics:
1. Collect a subreddit's submissions
> timesearch.py timesearch -r subredditname
2. Collect the comments for those submissions
> timesearch.py commentaugment -r subredditname
3. Stay up-to-date
> timesearch.py livestream -r subredditname
Commands for collecting:
{timesearch}
{commentaugment}
{livestream}
{getstyles}
{getwiki}
Commands for processing:
{offline_reading}
{redmash}
{breakdown}
{mergedb}
TO SEE DETAILS ON EACH COMMAND, RUN
> timesearch.py <command>
'''
MODULE_DOCSTRINGS = {
'breakdown': '''
breakdown:
Give the comment / submission counts for users in a subreddit, or
the subreddits that a user posts to.
Automatically dumps into a <database>_breakdown.json file
in the same directory as the database.
> timesearch.py breakdown -r subredditname
> timesearch.py breakdown -u username
flags:
-r "test" | --subreddit "test":
The subreddit database to break down.
-u "test" | --username "test":
The username database to break down.
--sort "name" | "submissions" | "comments" | "total_posts"
Sort the output.
''',
'commentaugment': '''
commentaugment:
Collect comments for the submissions in the database.
NOTE - if you did a timesearch scan on a username, this function is
mostly useless. It collects comments that were made on OP's submissions
but it does not find OP's comments on other people's submissions which
is what you probably wanted. Unfortunately that's not possible.
> timesearch.py commentaugment -r subredditname <flags>
> timesearch.py commentaugment -u username <flags>
flags:
-l 18 | --limit 18:
The number of MoreComments objects to replace.
Default: No limit
-t 5 | --threshold 5:
The number of comments a MoreComments object must claim to have
for us to open it.
Actual number received may be lower.
Default: >= 0
-n 4 | --num_thresh 4:
The number of comments a submission must claim to have for us to
scan it at all.
Actual number received may be lower.
Default: >= 1
-s "t3_xxxxxx" | --specific "t3_xxxxxx":
Given a submission ID, t3_xxxxxx, scan only that submission.
-v | --verbose:
If provided, print more stuff while working.
''',
'getstyles': '''
getstyles:
Collect the stylesheet, and css images.
> timesearch.py getstyles -r subredditname
''',
'getwiki': '''
getwiki:
Collect all available wiki pages.
> timesearch.py getwiki -r subredditname
''',
'mergedb': '''
mergedb:
Copy all new posts from one timesearch database into another.
> timesearch mergedb --from redditdev1.db --to redditdev2.db
flags:
--from:
The database file containing the posts you wish to copy.
--to:
The database file to which you will copy the posts.
The database is modified in-place.
Existing posts will be ignored and not updated.
''',
'livestream': '''
livestream:
Continuously collect submissions and/or comments.
> timesearch.py livestream -r subredditname <flags>
> timesearch.py livestream -u username <flags>
flags:
-r "test" | --subreddit "test":
The subreddit to collect from.
-u "test" | --username "test":
The redditor to collect from.
-s | --submissions:
If provided, do collect submissions. Otherwise don't.
-c | --comments:
If provided, do collect comments. Otherwise don't.
If submissions and comments are BOTH left unspecified, then they will
BOTH be collected.
-v | --verbose:
If provided, print extra information to the screen.
-w 30 | --wait 30:
The number of seconds to wait between cycles.
-1 | --once:
If provided, only do a single loop. Otherwise go forever.
''',
'offline_reading': '''
offline_reading:
Render submissions and comment threads to HTML via Markdown.
> timesearch.py offline_reading -r subredditname <flags>
> timesearch.py offline_reading -u username <flags>
flags:
-s "t3_xxxxxx" | --specific "t3_xxxxxx":
Given a submission ID, t3_xxxxxx, render only that submission.
Otherwise render every submission in the database.
''',
'redmash': '''
redmash:
Dump submission listings to a plaintext or HTML file.
> timesearch.py redmash -r subredditname <flags>
> timesearch.py redmash -u username <flags>
flags:
-r "test" | --subreddit "test":
The subreddit database to dump
-u "test" | --username "test":
The username database to dump
--html:
Write HTML files instead of plain text.
-st 50 | --score_threshold 50:
Only mash posts with at least this many points.
Applies to ALL mashes!
--all:
Perform all of the mashes listed below.
--date:
Perform a mash sorted by date.
--title:
Perform a mash sorted by title.
--score:
Perform a mash sorted by score.
--author:
For subreddit databases only.
Perform a mash sorted by author.
--sub:
For username databases only.
Perform a mash sorted by subreddit.
--flair:
Perform a mash sorted by flair.
examples:
`timesearch redmash -r botwatch --date`
does only the date file.
`timesearch redmash -r botwatch --score --title`
does both the score and title files.
`timesearch redmash -r botwatch --score --score_threshold 50`
only shows submissions with >= 50 points.
`timesearch redmash -r botwatch --all`
performs all of the different mashes.
''',
'timesearch': '''
timesearch:
Collect submissions from the subreddit across all of history, or
Collect submissions by a user (as many as possible).
> timesearch.py timesearch -r subredditname <flags>
> timesearch.py timesearch -u username <flags>
-r "test" | --subreddit "test":
The subreddit to scan. Mutually exclusive with username.
-u "test" | --username "test":
The user to scan. Mutually exclusive with subreddit.
-l "update" | --lower "update":
If a number - the unix timestamp to start at.
If "update" - continue from latest submission in db.
Default: update
-up 1467460221 | --upper 1467460221:
If a number - the unix timestamp to stop at.
If not provided - stop at current time.
Default: current time
-i 86400 | --interval 86400:
The initial interval for the scanning window, in seconds.
This is only a starting value. The window will shrink and stretch
as necessary based on received submission counts.
Default: 86400
''',
}
def docstring_preview(text):
'''
Return the brief description at the top of the text.
User can get full text by looking at each specifically.
'''
return text.split('\n\n')[0]
def listget(li, index, fallback=None):
try:
return li[index]
except IndexError:
return fallback
def indent(text, spaces=4):
spaces = ' ' * spaces
return '\n'.join(spaces + line if line.strip() != '' else line for line in text.split('\n'))
docstring_headers = {
key: indent(docstring_preview(value))
for (key, value) in MODULE_DOCSTRINGS.items()
}
DOCSTRING = DOCSTRING.format(**docstring_headers)
####################################################################################################
####################################################################################################
def breakdown_gateway(args):
from . import breakdown
breakdown.breakdown_argparse(args)
def commentaugment_gateway(args):
from . import commentaugment
commentaugment.commentaugment_argparse(args)
def getstyles_gateway(args):
from . import getstyles
getstyles.getstyles_argparse(args)
def getwiki_gateway(args):
from . import getwiki
getwiki.getwiki_argparse(args)
def livestream_gateway(args):
from . import livestream
livestream.livestream_argparse(args)
def mergedb_gateway(args):
from . import mergedb
mergedb.mergedb_argparse(args)
def offline_reading_gateway(args):
from . import offline_reading
offline_reading.offline_reading_argparse(args)
def redmash_gateway(args):
from . import redmash
redmash.redmash_argparse(args)
def timesearch_gateway(args):
from . import timesearch
timesearch.timesearch_argparse(args)
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()
p_breakdown = subparsers.add_parser('breakdown')
p_breakdown.add_argument('--sort', dest='sort', default=None)
p_breakdown.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_breakdown.add_argument('-u', '--user', dest='username', default=None)
p_breakdown.set_defaults(func=breakdown_gateway)
p_commentaugment = subparsers.add_parser('commentaugment')
p_commentaugment.add_argument('-l', '--limit', dest='limit', default=None)
p_commentaugment.add_argument('-n', '--num_thresh', dest='num_thresh', default=1)
p_commentaugment.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_commentaugment.add_argument('-s', '--specific', dest='specific_submission', default=None)
p_commentaugment.add_argument('-t', '--threshold', dest='threshold', default=0)
p_commentaugment.add_argument('-u', '--user', dest='username', default=None)
p_commentaugment.add_argument('-v', '--verbose', dest='verbose', action='store_true')
p_commentaugment.set_defaults(func=commentaugment_gateway)
p_getstyles = subparsers.add_parser('getstyles')
p_getstyles.add_argument('-r', '--subreddit', dest='subreddit')
p_getstyles.set_defaults(func=getstyles_gateway)
p_getwiki = subparsers.add_parser('getwiki')
p_getwiki.add_argument('-r', '--subreddit', dest='subreddit')
p_getwiki.set_defaults(func=getwiki_gateway)
p_livestream = subparsers.add_parser('livestream')
p_livestream.add_argument('-1', '--once', dest='once', action='store_true')
p_livestream.add_argument('-c', '--comments', dest='comments', action='store_true')
p_livestream.add_argument('-l', '--limit', dest='limit', default=None)
p_livestream.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_livestream.add_argument('-s', '--submissions', dest='submissions', action='store_true')
p_livestream.add_argument('-u', '--user', dest='username', default=None)
p_livestream.add_argument('-v', '--verbose', dest='verbose', action='store_true')
p_livestream.add_argument('-w', '--wait', dest='sleepy', default=30)
p_livestream.set_defaults(func=livestream_gateway)
p_mergedb = subparsers.add_parser('mergedb')
p_mergedb.add_argument('--from', dest='from_db_path', required=True)
p_mergedb.add_argument('--to', dest='to_db_path', required=True)
p_mergedb.set_defaults(func=mergedb_gateway)
p_offline_reading = subparsers.add_parser('offline_reading')
p_offline_reading.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_offline_reading.add_argument('-s', '--specific', dest='specific_submission', default=None)
p_offline_reading.add_argument('-u', '--user', dest='username', default=None)
p_offline_reading.set_defaults(func=offline_reading_gateway)
p_redmash = subparsers.add_parser('redmash')
p_redmash.add_argument('--all', dest='do_all', action='store_true')
p_redmash.add_argument('--author', dest='do_author', action='store_true')
p_redmash.add_argument('--date', dest='do_date', action='store_true')
p_redmash.add_argument('--flair', dest='do_flair', action='store_true')
p_redmash.add_argument('--html', dest='html', action='store_true')
p_redmash.add_argument('--score', dest='do_score', action='store_true')
p_redmash.add_argument('--sub', dest='do_subreddit', action='store_true')
p_redmash.add_argument('--title', dest='do_title', action='store_true')
p_redmash.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_redmash.add_argument('-st', '--score_threshold', dest='score_threshold', default=0)
p_redmash.add_argument('-u', '--user', dest='username', default=None)
p_redmash.set_defaults(func=redmash_gateway)
p_timesearch = subparsers.add_parser('timesearch')
p_timesearch.add_argument('-i', '--interval', dest='interval', default=86400)
p_timesearch.add_argument('-l', '--lower', dest='lower', default='update')
p_timesearch.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_timesearch.add_argument('-u', '--user', dest='username', default=None)
p_timesearch.add_argument('-up', '--upper', dest='upper', default=None)
p_timesearch.set_defaults(func=timesearch_gateway)
def main(argv):
helpstrings = {'', 'help', '-h', '--help'}
command = listget(argv, 0, '').lower()
# The user did not enter a command, or entered something unrecognized.
if command not in MODULE_DOCSTRINGS:
print(DOCSTRING)
if command == '':
print('You are seeing the default help text because you did not choose a command.')
elif command not in helpstrings:
print('You are seeing the default help text because "%s" was not recognized.' % command)
return 1
# The user entered a command, but no further arguments, or just help.
argument = listget(argv, 1, '').lower()
if argument in helpstrings:
print(MODULE_DOCSTRINGS[command])
return 1
args = parser.parse_args(argv)
try:
args.func(args)
except exceptions.DBNotFound as e:
message = '"%s" is not an existing database.'
message += '\nHave you used any of the other utilities to collect data?'
message = message % e.path.absolute_path
print(message)
return 1
return 0
if __name__ == '__main__':
raise SystemExit(main(sys.argv[1:]))
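Since the launcher simply forwards `sys.argv` into `main()`, the same dispatch can also be driven programmatically. A hypothetical example (subreddit name made up):

```python
# Equivalent to:  python timesearch.py breakdown -r botwatch --sort comments
import timesearch

status = timesearch.main(['breakdown', '-r', 'botwatch', '--sort', 'comments'])
raise SystemExit(status)
```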

103
timesearch/breakdown.py Normal file

@@ -0,0 +1,103 @@
import os
import json
from . import common
from . import tsdb
def breakdown_database(subreddit=None, username=None):
'''
Given a database, return a json dict breaking down the submission / comment count for
users (if a subreddit database) or subreddits (if a user database).
'''
if (subreddit is None) == (username is None):
raise Exception('Enter subreddit or username but not both')
breakdown_results = {}
def _ingest(names, subkey):
for name in names:
breakdown_results.setdefault(name, {})
breakdown_results[name].setdefault(subkey, 0)
breakdown_results[name][subkey] += 1
if subreddit:
database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
else:
database = tsdb.TSDB.for_user(username, do_create=False)
cur = database.sql.cursor()
for table in ['submissions', 'comments']:
if subreddit:
cur.execute('SELECT author FROM %s' % table)
elif username:
cur.execute('SELECT subreddit FROM %s' % table)
names = (row[0] for row in common.fetchgenerator(cur))
_ingest(names, table)
for name in breakdown_results:
breakdown_results[name].setdefault('submissions', 0)
breakdown_results[name].setdefault('comments', 0)
return breakdown_results
def breakdown_argparse(args):
if args.subreddit:
database = tsdb.TSDB.for_subreddit(args.subreddit, do_create=False)
else:
database = tsdb.TSDB.for_user(args.username, do_create=False)
breakdown_results = breakdown_database(
subreddit=args.subreddit,
username=args.username,
)
def sort_name(name):
return name.lower()
def sort_submissions(name):
invert_score = -1 * breakdown_results[name]['submissions']
return (invert_score, name.lower())
def sort_comments(name):
invert_score = -1 * breakdown_results[name]['comments']
return (invert_score, name.lower())
def sort_total_posts(name):
invert_score = breakdown_results[name]['submissions'] + breakdown_results[name]['comments']
invert_score = -1 * invert_score
return (invert_score, name.lower())
breakdown_sorters = {
'name': sort_name,
'submissions': sort_submissions,
'comments': sort_comments,
'total_posts': sort_total_posts,
}
breakdown_names = list(breakdown_results.keys())
if args.sort is not None:
try:
sorter = breakdown_sorters[args.sort.lower()]
except KeyError:
message = '{sorter} is not a sorter. Choose from {options}'
message = message.format(sorter=args.sort, options=list(breakdown_sorters.keys()))
raise KeyError(message)
breakdown_names.sort(key=sorter)
dump = ' "{name}": {{"submissions": {submissions}, "comments": {comments}}}'
dump = [dump.format(name=name, **breakdown_results[name]) for name in breakdown_names]
dump = ',\n'.join(dump)
dump = '{\n' + dump + '\n}\n'
else:
dump = json.dumps(breakdown_results)
if args.sort is None:
breakdown_basename = '%s_breakdown.json'
else:
breakdown_basename = '%%s_breakdown_%s.json' % args.sort
breakdown_basename = breakdown_basename % database.filepath.replace_extension('').basename
breakdown_filepath = database.breakdown_dir.with_child(breakdown_basename)
os.makedirs(breakdown_filepath.parent.absolute_path, exist_ok=True)
breakdown_file = open(breakdown_filepath.absolute_path, 'w')
with breakdown_file:
breakdown_file.write(dump)
print('Wrote', breakdown_filepath.relative_path)
return breakdown_results
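A hypothetical direct call (database name and counts made up), showing the shape of the returned data:

```python
from timesearch import breakdown

# Assumes a previous timesearch scan already created the botwatch database.
results = breakdown.breakdown_database(subreddit='botwatch')
# results maps each author to its counts, roughly:
# {'some_user': {'submissions': 12, 'comments': 340}, ...}
```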

179
timesearch/commentaugment.py Normal file

@@ -0,0 +1,179 @@
import traceback
from . import common
from . import tsdb
def commentaugment(
subreddit=None,
username=None,
limit=0,
num_thresh=0,
specific_submission=None,
threshold=0,
verbose=0,
):
'''
Take the IDs of collected submissions, and gather comments from those threads.
Please see the 'commentaugment' docstring in timesearch/__init__.py.
'''
common.bot.login(common.r)
if specific_submission is not None:
if not specific_submission.startswith('t3_'):
specific_submission = 't3_' + specific_submission
specific_submission_obj = common.r.submission(specific_submission[3:])
subreddit = specific_submission_obj.subreddit.display_name
if (subreddit is None) == (username is None):
raise Exception('Enter subreddit or username but not both')
if subreddit:
if specific_submission is None:
database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
else:
database = tsdb.TSDB.for_subreddit(subreddit, do_create=True)
else:
database = tsdb.TSDB.for_user(username, do_create=False)
cur = database.sql.cursor()
if limit == 0:
limit = None
if specific_submission is None:
query = '''
SELECT idstr FROM submissions
WHERE idstr IS NOT NULL
AND augmented_at IS NULL
AND num_comments >= ?
ORDER BY num_comments DESC
'''
bindings = [num_thresh]
cur.execute(query, bindings)
fetchall = [item[0] for item in cur.fetchall()]
else:
# Make sure the object we're augmenting is in the table too!
database.insert(specific_submission_obj)
fetchall = [specific_submission]
totalthreads = len(fetchall)
if verbose:
spacer = '\n\t'
else:
spacer = ' '
scannedthreads = 0
get_submission = common.nofailrequest(get_submission_immediately)
while len(fetchall) > 0:
id_batch = fetchall[:100]
fetchall = fetchall[100:]
for submission in id_batch:
submission = get_submission(submission.split('_')[-1])
message = 'Processing {fullname}{spacer}expecting {num_comments} | '
message = message.format(
fullname=submission.fullname,
spacer=spacer,
num_comments=submission.num_comments,
)
print(message, end='', flush=True)
if verbose:
print()
comments = get_comments_for_thread(submission, limit, threshold, verbose)
database.insert(comments, commit=False)
query = '''
UPDATE submissions
set augmented_at = ?,
augmented_count = ?
WHERE idstr == ?
'''
bindings = [common.get_now(), len(comments), submission.fullname]
cur.execute(query, bindings)
database.sql.commit()
scannedthreads += 1
if verbose:
print('\t', end='')
message = 'Found {count} |{spacer}{scannedthreads} / {totalthreads}'
message = message.format(
count=len(comments),
spacer=spacer,
scannedthreads=scannedthreads,
totalthreads=totalthreads,
)
print(message)
def get_comments_for_thread(submission, limit, threshold, verbose):
comments = common.nofailrequest(lambda x: x.comments)(submission)
# PRAW4 flatten is just list().
comments = manually_replace_comments(comments, limit, threshold, verbose)
return comments
def get_submission_immediately(submission_id):
submission = common.r.submission(submission_id)
# force the lazyloader
submission.title = submission.title
return submission
def manually_replace_comments(incomments, limit=None, threshold=0, verbose=False):
'''
PRAW's replace_more_comments method cannot continue
where it left off in the case of an Ow! screen.
So I'm writing my own function to get each MoreComments item individually
Furthermore, this function will maximize the number of retrieved comments by
sorting the MoreComments objects and getting the big chunks before worrying
about the tail ends.
'''
incomments = incomments.list()
comments = []
morecomments = []
while len(incomments) > 0:
item = incomments.pop()
if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold:
morecomments.append(item)
elif isinstance(item, common.praw.models.Comment):
comments.append(item)
while True:
try:
if limit is not None and limit <= 0:
break
if len(morecomments) == 0:
break
morecomments.sort(key=lambda x: x.count)
mc = morecomments.pop()
additional = common.nofailrequest(mc.comments)()
additionals = 0
if limit is not None:
limit -= 1
for item in additional:
if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold:
morecomments.append(item)
elif isinstance(item, common.praw.models.Comment):
comments.append(item)
additionals += 1
if verbose:
s = '\tGot %d more, %d so far.' % (additionals, len(comments))
if limit is not None:
s += ' Can perform %d more replacements' % limit
print(s)
except KeyboardInterrupt:
raise
except Exception:
traceback.print_exc()
return comments
def commentaugment_argparse(args):
return commentaugment(
subreddit=args.subreddit,
username=args.username,
limit=common.int_none(args.limit),
threshold=common.int_none(args.threshold),
num_thresh=common.int_none(args.num_thresh),
verbose=args.verbose,
specific_submission=args.specific_submission,
)
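A hypothetical direct call for the `--specific` case described in the module docstring (the submission ID is a placeholder):

```python
# Equivalent to:  python timesearch.py commentaugment -s t3_xxxxxx
from timesearch import commentaugment

commentaugment.commentaugment(specific_submission='t3_xxxxxx')
```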

104
timesearch/common.py Normal file

@@ -0,0 +1,104 @@
import datetime
import os
import time
import traceback
try:
import praw
except ImportError:
praw = None
if praw is None or praw.__version__.startswith('3.'):
import praw4
praw = praw4
try:
import bot
except ImportError:
bot = None
if bot is None or bot.praw != praw:
import bot4
bot = bot4
r = bot.anonymous()
def assert_file_exists(filepath):
if not os.path.exists(filepath):
raise FileNotFoundError(filepath)
def b36(i):
if isinstance(i, int):
return base36encode(i)
return base36decode(i)
def base36decode(number):
return int(number, 36)
def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
"""Converts an integer to a base36 string."""
if not isinstance(number, (int)):
raise TypeError('number must be an integer')
base36 = ''
sign = ''
if number < 0:
sign = '-'
number = -number
if 0 <= number < len(alphabet):
return sign + alphabet[number]
while number != 0:
number, i = divmod(number, len(alphabet))
base36 = alphabet[i] + base36
return sign + base36
def fetchgenerator(cursor):
while True:
item = cursor.fetchone()
if item is None:
break
yield item
def generator_chunker(generator, chunk_size):
chunk = []
for item in generator:
chunk.append(item)
if len(chunk) == chunk_size:
yield chunk
chunk = []
if len(chunk) != 0:
yield chunk
def get_now(stamp=True):
now = datetime.datetime.now(datetime.timezone.utc)
if stamp:
return int(now.timestamp())
return now
def human(timestamp):
x = datetime.datetime.utcfromtimestamp(timestamp)
x = datetime.datetime.strftime(x, "%b %d %Y %H:%M:%S")
return x
def int_none(x):
if x is None:
return None
return int(x)
def nofailrequest(function):
'''
Creates a function that will retry until it succeeds.
This function accepts 1 parameter, a function, and returns a modified
version of that function that will try-catch, sleep, and loop until it
finally returns.
'''
def a(*args, **kwargs):
while True:
try:
result = function(*args, **kwargs)
return result
except KeyboardInterrupt:
raise
except Exception:
traceback.print_exc()
print('Retrying in 2...')
time.sleep(2)
return a
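Two hypothetical usage sketches for the helpers above: wrapping a flaky network call with `nofailrequest`, and round-tripping a reddit-style base36 ID with `b36` (the ID is made up):

```python
import requests
from timesearch import common

# nofailrequest returns a wrapper that retries every 2 seconds until the call succeeds.
fetch = common.nofailrequest(requests.get)
response = fetch('https://www.reddit.com/r/botwatch/about.json', headers={'User-Agent': 'example'})

ident = common.b36('7vyq0g')       # base36 string -> integer
print(ident, common.b36(ident))    # integer -> back to the base36 string
```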

3
timesearch/exceptions.py Normal file

@@ -0,0 +1,3 @@
class DBNotFound(FileNotFoundError):
def __init__(self, path):
self.path = path

31
timesearch/getstyles.py Normal file

@@ -0,0 +1,31 @@
import os
import requests
from . import common
from . import tsdb
def getstyles(subreddit):
print('Getting styles for /r/%s' % subreddit)
subreddit = common.r.subreddit(subreddit)
styles = subreddit.stylesheet()
database = tsdb.TSDB.for_subreddit(subreddit.display_name)
os.makedirs(database.styles_dir.absolute_path, exist_ok=True)
stylesheet_filepath = database.styles_dir.with_child('stylesheet.css')
print('Downloading %s' % stylesheet_filepath.relative_path)
with open(stylesheet_filepath.absolute_path, 'w', encoding='utf-8') as stylesheet:
stylesheet.write(styles.stylesheet)
for image in styles.images:
image_basename = image['name'] + '.' + image['url'].split('.')[-1]
image_filepath = database.styles_dir.with_child(image_basename)
print('Downloading %s' % image_filepath.relative_path)
with open(image_filepath.absolute_path, 'wb') as image_file:
response = requests.get(image['url'])
image_file.write(response.content)
def getstyles_argparse(args):
return getstyles(args.subreddit)

23
timesearch/getwiki.py Normal file

@@ -0,0 +1,23 @@
import os
from . import common
from . import tsdb
def getwiki(subreddit):
print('Getting wiki pages for /r/%s' % subreddit)
subreddit = common.r.subreddit(subreddit)
database = tsdb.TSDB.for_subreddit(subreddit)
for wikipage in subreddit.wiki:
if wikipage.name == 'config/stylesheet':
continue
wikipage_path = database.wiki_dir.join(wikipage.name).replace_extension('md')
os.makedirs(wikipage_path.parent.absolute_path, exist_ok=True)
with open(wikipage_path.absolute_path, 'w', encoding='utf-8') as handle:
handle.write(wikipage.content_md)
print('Wrote', wikipage_path.relative_path)
def getwiki_argparse(args):
return getwiki(args.subreddit)

175
timesearch/livestream.py Normal file

@@ -0,0 +1,175 @@
import copy
import time
import traceback
from . import common
from . import tsdb
def livestream(
subreddit=None,
username=None,
verbose=False,
as_a_generator=False,
do_submissions=True,
do_comments=True,
limit=100,
only_once=False,
sleepy=30,
):
'''
Continuously get posts from this source
and insert them into the database
as_a_generator:
return a generator where every iteration does a single livestream loop.
This is good if you want to manage multiple livestreams yourself by
calling `next` on each of them, instead of getting stuck in here.
'''
if bool(subreddit) == bool(username):
raise Exception('Require either username / subreddit parameter, but not both')
if bool(do_submissions) is bool(do_comments) is False:
raise Exception('Require do_submissions and/or do_comments parameter')
common.bot.login(common.r)
if subreddit:
print('Getting subreddit %s' % subreddit)
database = tsdb.TSDB.for_subreddit(subreddit)
subreddit = common.r.subreddit(subreddit)
submissions = subreddit.new if do_submissions else None
comments = subreddit.comments if do_comments else None
else:
print('Getting redditor %s' % username)
database = tsdb.TSDB.for_user(username)
user = common.r.redditor(username)
submissions = user.submissions.new if do_submissions else None
comments = user.comments.new if do_comments else None
generator = _livestream_as_a_generator(
database,
submission_function=submissions,
comment_function=comments,
limit=limit,
params={'show': 'all'},
verbose=verbose,
)
if as_a_generator:
return generator
while True:
try:
step = next(generator)
newtext = '%ds, %dc' % (step['new_submissions'], step['new_comments'])
totalnew = step['new_submissions'] + step['new_comments']
status = '{now} +{new}'.format(now=common.human(common.get_now()), new=newtext)
print(status, end='', flush=True)
if totalnew == 0 and verbose is False:
# Since there was nothing new, allow the next line to overwrite the status
print('\r', end='')
else:
print()
if verbose:
print('Loop finished.')
if only_once:
break
time.sleep(sleepy)
except KeyboardInterrupt:
print()
return
except Exception:
traceback.print_exc()
print('Retrying in 5...')
time.sleep(5)
hangman = lambda: livestream(
username='gallowboob',
do_submissions=True,
do_comments=True,
sleepy=60,
)
def _livestream_as_a_generator(
database,
submission_function,
comment_function,
limit,
params,
verbose,
):
while True:
#common.r.handler.clear_cache()
try:
items = _livestream_helper(
submission_function=submission_function,
comment_function=comment_function,
limit=limit,
params=params,
verbose=verbose,
)
newitems = database.insert(items)
yield newitems
except Exception:
traceback.print_exc()
print('Retrying in 5...')
time.sleep(5)
def _livestream_helper(
submission_function=None,
comment_function=None,
verbose=False,
*args,
**kwargs,
):
'''
Given a submission-retrieving function and/or a comment-retrieving function,
collect submissions and comments in a list together and return that.
args and kwargs go into the collecting functions.
'''
if bool(submission_function) is bool(comment_function) is False:
raise Exception('Require submissions and/or comments parameter')
results = []
if submission_function:
if verbose:
print('Getting submissions', args, kwargs)
this_kwargs = copy.deepcopy(kwargs)
submission_batch = submission_function(*args, **this_kwargs)
results.extend(submission_batch)
if comment_function:
if verbose:
print('Getting comments', args, kwargs)
this_kwargs = copy.deepcopy(kwargs)
comment_batch = comment_function(*args, **this_kwargs)
results.extend(comment_batch)
if verbose:
print('Collected. Saving...')
return results
def livestream_argparse(args):
if args.submissions is args.comments is False:
args.submissions = True
args.comments = True
if args.limit is None:
limit = 100
else:
limit = int(args.limit)
return livestream(
subreddit=args.subreddit,
username=args.username,
do_comments=args.comments,
do_submissions=args.submissions,
limit=limit,
verbose=args.verbose,
only_once=args.once,
sleepy=common.int_none(args.sleepy),
)
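A hypothetical sketch of the `as_a_generator` usage described in the docstring, driving two streams side by side (the names are made up):

```python
import time
from timesearch import livestream

streams = [
    livestream.livestream(subreddit='botwatch', as_a_generator=True),
    livestream.livestream(username='goldensights', as_a_generator=True),
]
while True:
    for stream in streams:
        next(stream)   # one collection cycle for this stream
    time.sleep(30)
```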

35
timesearch/mergedb.py Normal file

@@ -0,0 +1,35 @@
import os
import requests
from . import common
from . import tsdb
MIGRATE_QUERY = '''
INSERT INTO {tablename}
SELECT othertable.* FROM other.{tablename} othertable
LEFT JOIN {tablename} mytable ON mytable.idint == othertable.idint
WHERE mytable.idint IS NULL;
'''
def _migrate_helper(db, tablename):
oldcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0]
query = MIGRATE_QUERY.format(tablename=tablename)
print(query)
db.cur.execute(query)
db.sql.commit()
newcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0]
print('Gained %d items.' % (newcount - oldcount))
def mergedb(from_db_path, to_db_path):
to_db = tsdb.TSDB(to_db_path)
from_db = tsdb.TSDB(from_db_path)
to_db.cur.execute('ATTACH DATABASE "%s" AS other' % from_db_path)
_migrate_helper(to_db, 'submissions')
_migrate_helper(to_db, 'comments')
def mergedb_argparse(args):
return mergedb(args.from_db_path, args.to_db_path)
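The `MIGRATE_QUERY` above is an anti-join: the LEFT JOIN with `WHERE mytable.idint IS NULL` keeps only rows whose idint is not already present in the destination table, so existing posts are skipped rather than overwritten. A hypothetical direct call (paths made up):

```python
# Equivalent to:  python timesearch.py mergedb --from botwatch_laptop.db --to botwatch.db
from timesearch import mergedb

mergedb.mergedb('botwatch_laptop.db', 'botwatch.db')
```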

340
timesearch/offline_reading.py Normal file

@@ -0,0 +1,340 @@
import os
import markdown
from . import common
from . import tsdb
class DBEntry:
def __init__(self, fetch):
if fetch[1].startswith('t3_'):
columns = tsdb.SQL_SUBMISSION_COLUMNS
self.object_type = 'submission'
else:
columns = tsdb.SQL_COMMENT_COLUMNS
self.object_type = 'comment'
self.id = None
self.idstr = None
for (index, attribute) in enumerate(columns):
setattr(self, attribute, fetch[index])
def __repr__(self):
return 'DBEntry(\'%s\')' % self.id