Initial migratory commit from voussoir/reddit.
commit 708c774e52

@ -0,0 +1,30 @@
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp
*.sln merge=union
*.csproj merge=union
*.vbproj merge=union
*.fsproj merge=union
*.dbproj merge=union

*.psd binary
*.zip binary
*.db binary
*.png binary
*.jpg binary
*.ico binary
*.exe binary

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain

@ -0,0 +1,231 @@
databases/*
@hangman.md
hangman.py
merge_database.py
migrate_20160605.py
timesearch_backup.py

*.ignore
*.db-journal
*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath


#################
## Visual Studio
#################

## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.

# User-specific files
*.suo
*.user
*.sln.docstates

# Build results

[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile

# Visual Studio profiler
*.psess
*.vsp
*.vspx

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# NCrunch
*.ncrunch*
.*crunch*.local.xml

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.Publish.xml
*.pubxml

# NuGet Packages Directory
## TODO: If you have NuGet Package Restore enabled, uncomment the next line
#packages/

# Windows Azure Build Output
csx
*.build.csdef

# Windows Store app package directory
AppPackages/

# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
App_Data/*.mdf
App_Data/*.ldf

#############
## Windows detritus
#############

# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Mac crap
.DS_Store


#############
## Python
#############

*.py[co]

# Packages
*.egg
*.egg-info
dist/
build/
eggs/
parts/
var/
sdist/
develop-eggs/
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

#Translations
*.mo

#Mr Developer
.mr.developer.cfg
=======
*~
*.egg
*.pyc
.coverage
*.egg-info/
_build/
build/
dist/
.DS_Store

@ -0,0 +1,114 @@
timesearch
==========

I don't have a test suite. You're my test suite! Messages go to [/u/GoldenSights](https://reddit.com/u/GoldenSights).

Timesearch is a collection of utilities for archiving subreddits.

### Make sure you have:
- Installed [Python](https://www.python.org/download). I use Python 3.6.
- Installed PRAW >= 4, as well as the other modules in `requirements.txt`. Try `pip install -r requirements.txt` to get them all.
- Created an OAuth app at https://reddit.com/prefs/apps. Make it `script` type, and set the redirect URI to `http://localhost:8080`. The title and description can be anything you want, and the about URL is not required.
- Used [this PRAW script](https://praw.readthedocs.io/en/latest/tutorials/refresh_token.html) to generate a refresh token. Just save it as a .py file somewhere and run it through your terminal / command line. For simplicity's sake, I just choose `all` for the scopes.
- Downloaded a copy of [this file](https://github.com/voussoir/reddit/blob/master/bot4.py) and saved it as `bot.py`. Fill out the variables using your OAuth information, and read the instructions to see where to put it. The useragent is a description of your API usage. Typically "/u/username's praw client" is sufficient.

### This package consists of:

- **timesearch**: If you try to page through `/new` on a subreddit, you'll hit a limit at or before 1,000 posts. Timesearch uses the `timestamp` cloudsearch query parameter to step from the beginning of a subreddit to present time, to collect as many submissions as possible. Read more about timestamp searching [here](https://www.reddit.com/r/reddittips/comments/2ix73n/use_cloudsearch_to_search_for_posts_on_reddit/). A sketch of what such a query looks like appears after this list.
    `> timesearch.py timesearch -r subredditname <flags>`
    `> timesearch.py timesearch -u username <flags>`

- **commentaugment**: Although we can search for submissions, we cannot search for comments. After performing a timesearch, you can use commentaugment to download the comment tree for each submission.
    Note: commentaugment only gets the comments attached to the submissions that you found in your timesearch scan. If you're trying to commentaugment on a user, you're going to get comments that were made on their submissions, **not** comments they made on other people's submissions. Therefore, comprehensively collecting a user's activity is not possible. You will have to use someone else's dataset like that of [/u/Stuck_in_the_Matrix](https://reddit.com/u/Stuck_in_the_Matrix) at [pushshift.io](https://pushshift.io).
    `> timesearch.py commentaugment -r subredditname <flags>`
    `> timesearch.py commentaugment -u username <flags>`

- **livestream**: timesearch+commentaugment is great for starting your database and getting historical posts, but it's not the best for staying up-to-date. Instead, livestream monitors `/new` and `/comments` to continuously ingest data.
    `> timesearch.py livestream -r subredditname <flags>`
    `> timesearch.py livestream -u username <flags>`

- **getstyles**: Downloads the stylesheet and CSS images.
    `> timesearch.py getstyles -r subredditname`

- **getwiki**: Downloads the wiki pages, sidebar, etc. from /wiki/pages.
    `> timesearch.py getwiki -r subredditname`

- **offline_reading**: Renders comment threads into HTML via markdown.
    Note: I'm currently using the [markdown library from pypi](https://pypi.python.org/pypi/Markdown), and it doesn't do reddit's custom markdown like `/r/` or `/u/`, obviously. So far I don't think anybody really uses o_r so I haven't invested much time into improving it.
    `> timesearch.py offline_reading -r subredditname <flags>`
    `> timesearch.py offline_reading -u username <flags>`

- **redmash**: Generates plaintext or HTML lists of submissions, sorted by a property of your choosing. You can order by date, author, flair, etc.
    `> timesearch.py redmash -r subredditname <flags>`
    `> timesearch.py redmash -u username <flags>`

- **breakdown**: Produces a JSON file indicating which users make the most posts in a subreddit, or which subreddits a user posts in.
    `> timesearch.py breakdown -r subredditname <flags>`
    `> timesearch.py breakdown -u username <flags>`

- **mergedb**: Copy all new data from one timesearch database into another. Useful for syncing or merging two scans of the same subreddit.
    `> timesearch.py mergedb --from filepath/database1.db --to filepath/database2.db`
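
For a concrete sense of the timestamp trick, this is roughly the kind of query timesearch steps through history with. It is an illustrative sketch using PRAW 4's cloudsearch syntax, not the package's actual implementation (which, per the changelog, uses PRAW's own iterator); the subreddit name and timestamps are just example values:

    import praw

    r = praw.Reddit()  # assumes your credentials are configured, e.g. in praw.ini

    # One window of the scan: submissions created between two unix timestamps.
    lower = 1467460221
    upper = lower + 86400  # one day, the default --interval
    query = 'timestamp:{0}..{1}'.format(lower, upper)
    for submission in r.subreddit('botwatch').search(query, syntax='cloudsearch', sort='new', limit=100):
        print(submission.fullname, submission.created_utc)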

### To use it

You will need both the `timesearch` package (folder) and the external `timesearch.py` file. You can click the green "Clone or Download" button in the upper right. When you run the .py file, it sends your commandline arguments into the package. You can view a summarized version of all the help text with just `timesearch.py`, or you can view a specific docstring with `timesearch.py livestream`, etc.

I recommend [sqlitebrowser](https://github.com/sqlitebrowser/sqlitebrowser/releases) if you want to inspect the database yourself.
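
If you prefer a quick look from Python instead of a GUI, the standard library's `sqlite3` module works too. The table and column names below come from the code in this commit; the database path is only an example of where yours might live:

    import sqlite3

    db = sqlite3.connect('databases/botwatch.db')  # use wherever timesearch created your database
    query = 'SELECT idstr, author, num_comments FROM submissions ORDER BY num_comments DESC LIMIT 10'
    for row in db.execute(query):
        print(row)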

### Changelog
- 2017 11 13
    - Gave timesearch its own Github repository so that (1) it will be easier for people to download it and (2) it has a cleaner, more independent URL. [voussoir/timesearch](https://github.com/voussoir/timesearch)

- 2017 11 05
    - Added a try-except inside livestream helper to prevent generator from terminating.

- 2017 11 04
    - For timesearch, I switched from using my custom cloudsearch iterator to the one that comes with PRAW4+.

- 2017 10 12
    - Added the `mergedb` utility for combining databases.

- 2017 06 02
    - You can use `commentaugment -s abcdef` to get a particular thread even if you haven't scraped anything else from that subreddit. Previously `-s` only worked if the database already existed and you specified it via `-r`. Now it is inferred from the submission itself.

- 2017 04 28
    - Complete restructure into package, started using PRAW4.

- 2016 08 10
    - Started merging redmash and wrote its argparser

- 2016 07 03
    - Improved docstring clarity.

- 2016 07 02
    - Added `livestream` argparse

- 2016 06 07
    - Offline_reading has been merged with the main timesearch file
    - `get_all_posts` renamed to `timesearch`
    - Timesearch parameter `usermode` renamed to `username`; `maxupper` renamed to `upper`.
    - Everything now accessible via commandline arguments. Read the docstring at the top of the file.

- 2016 06 05
    - NEW DATABASE SCHEME. Submissions and comments now live in different tables like they should have all along. Submission table has two new columns for a little bit of commentaugment metadata. This allows commentaugment to only scan threads that are new.
    - You can use the `migrate_20160605.py` script to convert old databases into new ones.

- 2015 11 11
    - created `offline_reading.py` which converts a timesearch database into a comment tree that can be rendered into HTML

- 2015 09 07
    - fixed bug which allowed `livestream` to crash because `bot.refresh()` was outside of the try-catch.

- 2015 08 19
    - fixed bug in which updatescores stopped iterating early if you had more than 100 comments in a row in the db
    - commentaugment has been completely merged into the timesearch.py file. You can use commentaugment_prompt() to input the parameters, or use the commentaugment() function directly.


____


I want to live in a future where everyone uses UTC and agrees on daylight savings.

<p align="center">
  <img src="https://github.com/voussoir/reddit/blob/master/.GitImages/timesearch_logo_256.png?raw=true" alt="Timesearch"/>
</p>

@ -0,0 +1,3 @@
markdown
praw
voussoirkit

@ -0,0 +1,5 @@
import sys
import timesearch

status_code = timesearch.main(sys.argv[1:])
raise SystemExit(status_code)

@ -0,0 +1,436 @@
import argparse
import sys

from . import exceptions

# NOTE: Originally I wanted the docstring for each module to be within their
# file. However, this means that composing the global helptext would require
# importing those modules, which will subsequently import PRAW and a whole lot
# of other things. This made TS very slow to load which is okay when you're
# actually using it but really terrible when you're just viewing the help text.
DOCSTRING = '''
Timesearch
The subreddit archiver

The basics:
1. Collect a subreddit's submissions
    > timesearch.py timesearch -r subredditname

2. Collect the comments for those submissions
    > timesearch.py commentaugment -r subredditname

3. Stay up-to-date
    > timesearch.py livestream -r subredditname


Commands for collecting:
{timesearch}
{commentaugment}
{livestream}
{getstyles}
{getwiki}

Commands for processing:
{offline_reading}
{redmash}
{breakdown}
{mergedb}

TO SEE DETAILS ON EACH COMMAND, RUN
> timesearch.py <command>
'''

MODULE_DOCSTRINGS = {
    'breakdown': '''
breakdown:
    Give the comment / submission counts for users in a subreddit, or
    the subreddits that a user posts to.

    Automatically dumps into a <database>_breakdown.json file
    in the same directory as the database.

    > timesearch.py breakdown -r subredditname
    > timesearch.py breakdown -u username

    flags:
    -r "test" | --subreddit "test":
        The subreddit database to break down.

    -u "test" | --username "test":
        The username database to break down.

    --sort "name" | "submissions" | "comments" | "total_posts"
        Sort the output.
    ''',

    'commentaugment': '''
commentaugment:
    Collect comments for the submissions in the database.
    NOTE - if you did a timesearch scan on a username, this function is
    mostly useless. It collects comments that were made on OP's submissions
    but it does not find OP's comments on other people's submissions which
    is what you probably wanted. Unfortunately that's not possible.

    > timesearch.py commentaugment -r subredditname <flags>
    > timesearch.py commentaugment -u username <flags>

    flags:
    -l 18 | --limit 18:
        The number of MoreComments objects to replace.
        Default: No limit

    -t 5 | --threshold 5:
        The number of comments a MoreComments object must claim to have
        for us to open it.
        Actual number received may be lower.
        Default: >= 0

    -n 4 | --num_thresh 4:
        The number of comments a submission must claim to have for us to
        scan it at all.
        Actual number received may be lower.
        Default: >= 1

    -s "t3_xxxxxx" | --specific "t3_xxxxxx":
        Given a submission ID, t3_xxxxxx, scan only that submission.

    -v | --verbose:
        If provided, print more stuff while working.
    ''',

    'getstyles': '''
getstyles:
    Collect the stylesheet, and css images.

    > timesearch.py getstyles -r subredditname
    ''',

    'getwiki': '''
getwiki:
    Collect all available wiki pages.

    > timesearch.py getwiki -r subredditname
    ''',

    'mergedb': '''
mergedb:
    Copy all new posts from one timesearch database into another.

    > timesearch mergedb --from redditdev1.db --to redditdev2.db

    flags:
    --from:
        The database file containing the posts you wish to copy.

    --to:
        The database file to which you will copy the posts.
        The database is modified in-place.
        Existing posts will be ignored and not updated.
    ''',

    'livestream': '''
livestream:
    Continuously collect submissions and/or comments.

    > timesearch.py livestream -r subredditname <flags>
    > timesearch.py livestream -u username <flags>

    flags:
    -r "test" | --subreddit "test":
        The subreddit to collect from.

    -u "test" | --username "test":
        The redditor to collect from.

    -s | --submissions:
        If provided, do collect submissions. Otherwise don't.

    -c | --comments:
        If provided, do collect comments. Otherwise don't.

    If submissions and comments are BOTH left unspecified, then they will
    BOTH be collected.

    -v | --verbose:
        If provided, print extra information to the screen.

    -w 30 | --wait 30:
        The number of seconds to wait between cycles.

    -1 | --once:
        If provided, only do a single loop. Otherwise go forever.
    ''',

    'offline_reading': '''
offline_reading:
    Render submissions and comment threads to HTML via Markdown.

    > timesearch.py offline_reading -r subredditname <flags>
    > timesearch.py offline_reading -u username <flags>

    flags:
    -s "t3_xxxxxx" | --specific "t3_xxxxxx":
        Given a submission ID, t3_xxxxxx, render only that submission.
        Otherwise render every submission in the database.
    ''',

    'redmash': '''
redmash:
    Dump submission listings to a plaintext or HTML file.

    > timesearch.py redmash -r subredditname <flags>
    > timesearch.py redmash -u username <flags>

    flags:
    -r "test" | --subreddit "test":
        The subreddit database to dump

    -u "test" | --username "test":
        The username database to dump

    --html:
        Write HTML files instead of plain text.

    -st 50 | --score_threshold 50:
        Only mash posts with at least this many points.
        Applies to ALL mashes!

    --all:
        Perform all of the mashes listed below.

    --date:
        Perform a mash sorted by date.

    --title:
        Perform a mash sorted by title.

    --score:
        Perform a mash sorted by score.

    --author:
        For subreddit databases only.
        Perform a mash sorted by author.

    --sub:
        For username databases only.
        Perform a mash sorted by subreddit.

    --flair:
        Perform a mash sorted by flair.

    examples:
        `timesearch redmash -r botwatch --date`
        does only the date file.

        `timesearch redmash -r botwatch --score --title`
        does both the score and title files.

        `timesearch redmash -r botwatch --score --score_threshold 50`
        only shows submissions with >= 50 points.

        `timesearch redmash -r botwatch --all`
        performs all of the different mashes.
    ''',

    'timesearch': '''
timesearch:
    Collect submissions from the subreddit across all of history, or
    Collect submissions by a user (as many as possible).

    > timesearch.py timesearch -r subredditname <flags>
    > timesearch.py timesearch -u username <flags>

    -r "test" | --subreddit "test":
        The subreddit to scan. Mutually exclusive with username.

    -u "test" | --username "test":
        The user to scan. Mutually exclusive with subreddit.

    -l "update" | --lower "update":
        If a number - the unix timestamp to start at.
        If "update" - continue from latest submission in db.
        Default: update

    -up 1467460221 | --upper 1467460221:
        If a number - the unix timestamp to stop at.
        If not provided - stop at current time.
        Default: current time

    -i 86400 | --interval 86400:
        The initial interval for the scanning window, in seconds.
        This is only a starting value. The window will shrink and stretch
        as necessary based on received submission counts.
        Default: 86400
    ''',
}


def docstring_preview(text):
    '''
    Return the brief description at the top of the text.
    User can get full text by looking at each specifically.
    '''
    return text.split('\n\n')[0]

def listget(li, index, fallback=None):
    try:
        return li[index]
    except IndexError:
        return fallback

def indent(text, spaces=4):
    spaces = ' ' * spaces
    return '\n'.join(spaces + line if line.strip() != '' else line for line in text.split('\n'))

docstring_headers = {
    key: indent(docstring_preview(value))
    for (key, value) in MODULE_DOCSTRINGS.items()
}

DOCSTRING = DOCSTRING.format(**docstring_headers)

####################################################################################################
####################################################################################################

def breakdown_gateway(args):
    from . import breakdown
    breakdown.breakdown_argparse(args)

def commentaugment_gateway(args):
    from . import commentaugment
    commentaugment.commentaugment_argparse(args)

def getstyles_gateway(args):
    from . import getstyles
    getstyles.getstyles_argparse(args)

def getwiki_gateway(args):
    from . import getwiki
    getwiki.getwiki_argparse(args)

def livestream_gateway(args):
    from . import livestream
    livestream.livestream_argparse(args)

def mergedb_gateway(args):
    from . import mergedb
    mergedb.mergedb_argparse(args)

def offline_reading_gateway(args):
    from . import offline_reading
    offline_reading.offline_reading_argparse(args)

def redmash_gateway(args):
    from . import redmash
    redmash.redmash_argparse(args)

def timesearch_gateway(args):
    from . import timesearch
    timesearch.timesearch_argparse(args)


parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()

p_breakdown = subparsers.add_parser('breakdown')
p_breakdown.add_argument('--sort', dest='sort', default=None)
p_breakdown.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_breakdown.add_argument('-u', '--user', dest='username', default=None)
p_breakdown.set_defaults(func=breakdown_gateway)

p_commentaugment = subparsers.add_parser('commentaugment')
p_commentaugment.add_argument('-l', '--limit', dest='limit', default=None)
p_commentaugment.add_argument('-n', '--num_thresh', dest='num_thresh', default=1)
p_commentaugment.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_commentaugment.add_argument('-s', '--specific', dest='specific_submission', default=None)
p_commentaugment.add_argument('-t', '--threshold', dest='threshold', default=0)
p_commentaugment.add_argument('-u', '--user', dest='username', default=None)
p_commentaugment.add_argument('-v', '--verbose', dest='verbose', action='store_true')
p_commentaugment.set_defaults(func=commentaugment_gateway)

p_getstyles = subparsers.add_parser('getstyles')
p_getstyles.add_argument('-r', '--subreddit', dest='subreddit')
p_getstyles.set_defaults(func=getstyles_gateway)

p_getwiki = subparsers.add_parser('getwiki')
p_getwiki.add_argument('-r', '--subreddit', dest='subreddit')
p_getwiki.set_defaults(func=getwiki_gateway)

p_livestream = subparsers.add_parser('livestream')
p_livestream.add_argument('-1', '--once', dest='once', action='store_true')
p_livestream.add_argument('-c', '--comments', dest='comments', action='store_true')
p_livestream.add_argument('-l', '--limit', dest='limit', default=None)
p_livestream.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_livestream.add_argument('-s', '--submissions', dest='submissions', action='store_true')
p_livestream.add_argument('-u', '--user', dest='username', default=None)
p_livestream.add_argument('-v', '--verbose', dest='verbose', action='store_true')
p_livestream.add_argument('-w', '--wait', dest='sleepy', default=30)
p_livestream.set_defaults(func=livestream_gateway)

p_mergedb = subparsers.add_parser('mergedb')
p_mergedb.add_argument('--from', dest='from_db_path', required=True)
p_mergedb.add_argument('--to', dest='to_db_path', required=True)
p_mergedb.set_defaults(func=mergedb_gateway)

p_offline_reading = subparsers.add_parser('offline_reading')
p_offline_reading.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_offline_reading.add_argument('-s', '--specific', dest='specific_submission', default=None)
p_offline_reading.add_argument('-u', '--user', dest='username', default=None)
p_offline_reading.set_defaults(func=offline_reading_gateway)

p_redmash = subparsers.add_parser('redmash')
p_redmash.add_argument('--all', dest='do_all', action='store_true')
p_redmash.add_argument('--author', dest='do_author', action='store_true')
p_redmash.add_argument('--date', dest='do_date', action='store_true')
p_redmash.add_argument('--flair', dest='do_flair', action='store_true')
p_redmash.add_argument('--html', dest='html', action='store_true')
p_redmash.add_argument('--score', dest='do_score', action='store_true')
p_redmash.add_argument('--sub', dest='do_subreddit', action='store_true')
p_redmash.add_argument('--title', dest='do_title', action='store_true')
p_redmash.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_redmash.add_argument('-st', '--score_threshold', dest='score_threshold', default=0)
p_redmash.add_argument('-u', '--user', dest='username', default=None)
p_redmash.set_defaults(func=redmash_gateway)

p_timesearch = subparsers.add_parser('timesearch')
p_timesearch.add_argument('-i', '--interval', dest='interval', default=86400)
p_timesearch.add_argument('-l', '--lower', dest='lower', default='update')
p_timesearch.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_timesearch.add_argument('-u', '--user', dest='username', default=None)
p_timesearch.add_argument('-up', '--upper', dest='upper', default=None)
p_timesearch.set_defaults(func=timesearch_gateway)

def main(argv):
    helpstrings = {'', 'help', '-h', '--help'}

    command = listget(argv, 0, '').lower()

    # The user did not enter a command, or entered something unrecognized.
    if command not in MODULE_DOCSTRINGS:
        print(DOCSTRING)
        if command == '':
            print('You are seeing the default help text because you did not choose a command.')
        elif command not in helpstrings:
            print('You are seeing the default help text because "%s" was not recognized' % command)
        return 1

    # The user entered a command, but no further arguments, or just help.
    argument = listget(argv, 1, '').lower()
    if argument in helpstrings:
        print(MODULE_DOCSTRINGS[command])
        return 1

    args = parser.parse_args(argv)
    try:
        args.func(args)
    except exceptions.DBNotFound as e:
        message = '"%s" is not an existing database.'
        message += '\nHave you used any of the other utilities to collect data?'
        message = message % e.path.absolute_path
        print(message)
        return 1

    return 0

if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))

@ -0,0 +1,103 @@
import os
import json

from . import common
from . import tsdb


def breakdown_database(subreddit=None, username=None):
    '''
    Given a database, return a json dict breaking down the submission / comment count for
    users (if a subreddit database) or subreddits (if a user database).
    '''
    if (subreddit is None) == (username is None):
        raise Exception('Enter subreddit or username but not both')

    breakdown_results = {}
    def _ingest(names, subkey):
        for name in names:
            breakdown_results.setdefault(name, {})
            breakdown_results[name].setdefault(subkey, 0)
            breakdown_results[name][subkey] += 1

    if subreddit:
        database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
    else:
        database = tsdb.TSDB.for_user(username, do_create=False)
    cur = database.sql.cursor()

    for table in ['submissions', 'comments']:
        if subreddit:
            cur.execute('SELECT author FROM %s' % table)
        elif username:
            cur.execute('SELECT subreddit FROM %s' % table)

        names = (row[0] for row in common.fetchgenerator(cur))
        _ingest(names, table)

    for name in breakdown_results:
        breakdown_results[name].setdefault('submissions', 0)
        breakdown_results[name].setdefault('comments', 0)

    return breakdown_results

def breakdown_argparse(args):
    if args.subreddit:
        database = tsdb.TSDB.for_subreddit(args.subreddit, do_create=False)
    else:
        database = tsdb.TSDB.for_user(args.username, do_create=False)

    breakdown_results = breakdown_database(
        subreddit=args.subreddit,
        username=args.username,
    )

    def sort_name(name):
        return name.lower()
    def sort_submissions(name):
        invert_score = -1 * breakdown_results[name]['submissions']
        return (invert_score, name.lower())
    def sort_comments(name):
        invert_score = -1 * breakdown_results[name]['comments']
        return (invert_score, name.lower())
    def sort_total_posts(name):
        invert_score = breakdown_results[name]['submissions'] + breakdown_results[name]['comments']
        invert_score = -1 * invert_score
        return (invert_score, name.lower())
    breakdown_sorters = {
        'name': sort_name,
        'submissions': sort_submissions,
        'comments': sort_comments,
        'total_posts': sort_total_posts,
    }

    breakdown_names = list(breakdown_results.keys())
    if args.sort is not None:
        try:
            sorter = breakdown_sorters[args.sort.lower()]
        except KeyError:
            message = '{sorter} is not a sorter. Choose from {options}'
            message = message.format(sorter=args.sort, options=list(breakdown_sorters.keys()))
            raise KeyError(message)
        breakdown_names.sort(key=sorter)
        dump = '    "{name}": {{"submissions": {submissions}, "comments": {comments}}}'
        dump = [dump.format(name=name, **breakdown_results[name]) for name in breakdown_names]
        dump = ',\n'.join(dump)
        dump = '{\n' + dump + '\n}\n'
    else:
        dump = json.dumps(breakdown_results)

    if args.sort is None:
        breakdown_basename = '%s_breakdown.json'
    else:
        breakdown_basename = '%%s_breakdown_%s.json' % args.sort

    breakdown_basename = breakdown_basename % database.filepath.replace_extension('').basename
    breakdown_filepath = database.breakdown_dir.with_child(breakdown_basename)
    os.makedirs(breakdown_filepath.parent.absolute_path, exist_ok=True)
    breakdown_file = open(breakdown_filepath.absolute_path, 'w')
    with breakdown_file:
        breakdown_file.write(dump)
        print('Wrote', breakdown_filepath.relative_path)

    return breakdown_results
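
# Illustrative shape of the sorted dump written above (names and counts are
# hypothetical, not from any real database):
# {
#     "automoderator": {"submissions": 2, "comments": 115},
#     "goldensights": {"submissions": 12, "comments": 40}
# }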

@ -0,0 +1,179 @@
import traceback

from . import common
from . import tsdb


def commentaugment(
        subreddit=None,
        username=None,
        limit=0,
        num_thresh=0,
        specific_submission=None,
        threshold=0,
        verbose=0,
    ):
    '''
    Take the IDs of collected submissions, and gather comments from those threads.
    Please see the global DOCSTRING_COMMENTAUGMENT variable.
    '''
    common.bot.login(common.r)
    if specific_submission is not None:
        if not specific_submission.startswith('t3_'):
            specific_submission = 't3_' + specific_submission
        specific_submission_obj = common.r.submission(specific_submission[3:])
        subreddit = specific_submission_obj.subreddit.display_name

    if (subreddit is None) == (username is None):
        raise Exception('Enter subreddit or username but not both')

    if subreddit:
        if specific_submission is None:
            database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
        else:
            database = tsdb.TSDB.for_subreddit(subreddit, do_create=True)
    else:
        database = tsdb.TSDB.for_user(username, do_create=False)
    cur = database.sql.cursor()

    if limit == 0:
        limit = None

    if specific_submission is None:
        query = '''
            SELECT idstr FROM submissions
            WHERE idstr IS NOT NULL
            AND augmented_at IS NULL
            AND num_comments >= ?
            ORDER BY num_comments DESC
        '''
        bindings = [num_thresh]
        cur.execute(query, bindings)
        fetchall = [item[0] for item in cur.fetchall()]
    else:
        # Make sure the object we're augmenting is in the table too!
        database.insert(specific_submission_obj)
        fetchall = [specific_submission]

    totalthreads = len(fetchall)

    if verbose:
        spacer = '\n\t'
    else:
        spacer = ' '

    scannedthreads = 0
    get_submission = common.nofailrequest(get_submission_immediately)
    while len(fetchall) > 0:
        id_batch = fetchall[:100]
        fetchall = fetchall[100:]

        for submission in id_batch:
            submission = get_submission(submission.split('_')[-1])
            message = 'Processing {fullname}{spacer}expecting {num_comments} | '
            message = message.format(
                fullname=submission.fullname,
                spacer=spacer,
                num_comments=submission.num_comments,
            )

            print(message, end='', flush=True)
            if verbose:
                print()

            comments = get_comments_for_thread(submission, limit, threshold, verbose)

            database.insert(comments, commit=False)
            query = '''
                UPDATE submissions
                set augmented_at = ?,
                augmented_count = ?
                WHERE idstr == ?
            '''
            bindings = [common.get_now(), len(comments), submission.fullname]
            cur.execute(query, bindings)
            database.sql.commit()

            scannedthreads += 1
            if verbose:
                print('\t', end='')
            message = 'Found {count} |{spacer}{scannedthreads} / {totalthreads}'
            message = message.format(
                count=len(comments),
                spacer=spacer,
                scannedthreads=scannedthreads,
                totalthreads=totalthreads,
            )
            print(message)

def get_comments_for_thread(submission, limit, threshold, verbose):
    comments = common.nofailrequest(lambda x: x.comments)(submission)
    # PRAW4 flatten is just list().
    comments = manually_replace_comments(comments, limit, threshold, verbose)
    return comments

def get_submission_immediately(submission_id):
    submission = common.r.submission(submission_id)
    # force the lazyloader
    submission.title = submission.title
    return submission

def manually_replace_comments(incomments, limit=None, threshold=0, verbose=False):
    '''
    PRAW's replace_more_comments method cannot continue
    where it left off in the case of an Ow! screen.
    So I'm writing my own function to get each MoreComments item individually

    Furthermore, this function will maximize the number of retrieved comments by
    sorting the MoreComments objects and getting the big chunks before worrying
    about the tail ends.
    '''
    incomments = incomments.list()
    comments = []
    morecomments = []
    while len(incomments) > 0:
        item = incomments.pop()
        if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold:
            morecomments.append(item)
        elif isinstance(item, common.praw.models.Comment):
            comments.append(item)

    while True:
        try:
            if limit is not None and limit <= 0:
                break
            if len(morecomments) == 0:
                break
            morecomments.sort(key=lambda x: x.count)
            mc = morecomments.pop()
            additional = common.nofailrequest(mc.comments)()
            additionals = 0
            if limit is not None:
                limit -= 1
            for item in additional:
                if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold:
                    morecomments.append(item)
                elif isinstance(item, common.praw.models.Comment):
                    comments.append(item)
                    additionals += 1
            if verbose:
                s = '\tGot %d more, %d so far.' % (additionals, len(comments))
                if limit is not None:
                    s += ' Can perform %d more replacements' % limit
                print(s)
        except KeyboardInterrupt:
            raise
        except Exception:
            traceback.print_exc()
    return comments

def commentaugment_argparse(args):
    return commentaugment(
        subreddit=args.subreddit,
        username=args.username,
        limit=common.int_none(args.limit),
        threshold=common.int_none(args.threshold),
        num_thresh=common.int_none(args.num_thresh),
        verbose=args.verbose,
        specific_submission=args.specific_submission,
    )

@ -0,0 +1,104 @@
import datetime
import os
import time
import traceback

try:
    import praw
except ImportError:
    praw = None
if praw is None or praw.__version__.startswith('3.'):
    import praw4
    praw = praw4

try:
    import bot
except ImportError:
    bot = None
if bot is None or bot.praw != praw:
    import bot4
    bot = bot4


r = bot.anonymous()

def assert_file_exists(filepath):
    if not os.path.exists(filepath):
        raise FileNotFoundError(filepath)

def b36(i):
    if isinstance(i, int):
        return base36encode(i)
    return base36decode(i)

def base36decode(number):
    return int(number, 36)

def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
    """Converts an integer to a base36 string."""
    if not isinstance(number, (int)):
        raise TypeError('number must be an integer')
    base36 = ''
    sign = ''
    if number < 0:
        sign = '-'
        number = -number
    if 0 <= number < len(alphabet):
        return sign + alphabet[number]
    while number != 0:
        number, i = divmod(number, len(alphabet))
        base36 = alphabet[i] + base36
    return sign + base36
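
# Illustrative check of the base36 helpers above (not part of the original
# module): reddit id36 strings convert to and from plain integers, e.g.
#     base36encode(1234)  -> 'ya'
#     base36decode('ya')  -> 1234
#     b36('ya')           -> 1234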

def fetchgenerator(cursor):
    while True:
        item = cursor.fetchone()
        if item is None:
            break
        yield item

def generator_chunker(generator, chunk_size):
    chunk = []
    for item in generator:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if len(chunk) != 0:
        yield chunk

def get_now(stamp=True):
    now = datetime.datetime.now(datetime.timezone.utc)
    if stamp:
        return int(now.timestamp())
    return now

def human(timestamp):
    x = datetime.datetime.utcfromtimestamp(timestamp)
    x = datetime.datetime.strftime(x, "%b %d %Y %H:%M:%S")
    return x

def int_none(x):
    if x is None:
        return None
    return int(x)

def nofailrequest(function):
    '''
    Creates a function that will retry until it succeeds.
    This function accepts 1 parameter, a function, and returns a modified
    version of that function that will try-catch, sleep, and loop until it
    finally returns.
    '''
    def a(*args, **kwargs):
        while True:
            try:
                result = function(*args, **kwargs)
                return result
            except KeyboardInterrupt:
                raise
            except Exception:
                traceback.print_exc()
                print('Retrying in 2...')
                time.sleep(2)
    return a
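
# Illustrative use of nofailrequest, mirroring the call sites in the other
# modules (the submission id is a placeholder):
#     get_submission = nofailrequest(r.submission)
#     submission = get_submission('abc123')  # retries every 2 seconds until the request succeeds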

@ -0,0 +1,3 @@
class DBNotFound(FileNotFoundError):
    def __init__(self, path):
        self.path = path

@ -0,0 +1,31 @@
import os
import requests

from . import common
from . import tsdb


def getstyles(subreddit):
    print('Getting styles for /r/%s' % subreddit)
    subreddit = common.r.subreddit(subreddit)

    styles = subreddit.stylesheet()
    database = tsdb.TSDB.for_subreddit(subreddit.display_name)

    os.makedirs(database.styles_dir.absolute_path, exist_ok=True)

    stylesheet_filepath = database.styles_dir.with_child('stylesheet.css')
    print('Downloading %s' % stylesheet_filepath.relative_path)
    with open(stylesheet_filepath.absolute_path, 'w', encoding='utf-8') as stylesheet:
        stylesheet.write(styles.stylesheet)

    for image in styles.images:
        image_basename = image['name'] + '.' + image['url'].split('.')[-1]
        image_filepath = database.styles_dir.with_child(image_basename)
        print('Downloading %s' % image_filepath.relative_path)
        with open(image_filepath.absolute_path, 'wb') as image_file:
            response = requests.get(image['url'])
            image_file.write(response.content)

def getstyles_argparse(args):
    return getstyles(args.subreddit)

@ -0,0 +1,23 @@
import os

from . import common
from . import tsdb


def getwiki(subreddit):
    print('Getting wiki pages for /r/%s' % subreddit)
    subreddit = common.r.subreddit(subreddit)
    database = tsdb.TSDB.for_subreddit(subreddit)

    for wikipage in subreddit.wiki:
        if wikipage.name == 'config/stylesheet':
            continue

        wikipage_path = database.wiki_dir.join(wikipage.name).replace_extension('md')
        os.makedirs(wikipage_path.parent.absolute_path, exist_ok=True)
        with open(wikipage_path.absolute_path, 'w', encoding='utf-8') as handle:
            handle.write(wikipage.content_md)
        print('Wrote', wikipage_path.relative_path)

def getwiki_argparse(args):
    return getwiki(args.subreddit)

@ -0,0 +1,175 @@
import copy
import time
import traceback

from . import common
from . import tsdb


def livestream(
        subreddit=None,
        username=None,
        verbose=False,
        as_a_generator=False,
        do_submissions=True,
        do_comments=True,
        limit=100,
        only_once=False,
        sleepy=30,
    ):
    '''
    Continuously get posts from this source
    and insert them into the database

    as_a_generator:
        return a generator where every iteration does a single livestream loop.
        This is good if you want to manage multiple livestreams yourself by
        calling `next` on each of them, instead of getting stuck in here.
    '''
    if bool(subreddit) == bool(username):
        raise Exception('Require either username / subreddit parameter, but not both')
    if bool(do_submissions) is bool(do_comments) is False:
        raise Exception('Require do_submissions and/or do_comments parameter')
    common.bot.login(common.r)

    if subreddit:
        print('Getting subreddit %s' % subreddit)
        database = tsdb.TSDB.for_subreddit(subreddit)
        subreddit = common.r.subreddit(subreddit)
        submissions = subreddit.new if do_submissions else None
        comments = subreddit.comments if do_comments else None
    else:
        print('Getting redditor %s' % username)
        database = tsdb.TSDB.for_user(username)
        user = common.r.redditor(username)
        submissions = user.submissions.new if do_submissions else None
        comments = user.comments.new if do_comments else None

    generator = _livestream_as_a_generator(
        database,
        submission_function=submissions,
        comment_function=comments,
        limit=limit,
        params={'show': 'all'},
        verbose=verbose,
    )
    if as_a_generator:
        return generator

    while True:
        try:
            step = next(generator)
            newtext = '%ds, %dc' % (step['new_submissions'], step['new_comments'])
            totalnew = step['new_submissions'] + step['new_comments']
            status = '{now} +{new}'.format(now=common.human(common.get_now()), new=newtext)
            print(status, end='', flush=True)
            if totalnew == 0 and verbose is False:
                # Since there was nothing new, allow the next line to overwrite the status
                print('\r', end='')
            else:
                print()

            if verbose:
                print('Loop finished.')
            if only_once:
                break
            time.sleep(sleepy)

        except KeyboardInterrupt:
            print()
            return

        except Exception:
            traceback.print_exc()
            print('Retrying in 5...')
            time.sleep(5)

hangman = lambda: livestream(
    username='gallowboob',
    do_submissions=True,
    do_comments=True,
    sleepy=60,
)

def _livestream_as_a_generator(
        database,
        submission_function,
        comment_function,
        limit,
        params,
        verbose,
    ):
    while True:
        #common.r.handler.clear_cache()
        try:
            items = _livestream_helper(
                submission_function=submission_function,
                comment_function=comment_function,
                limit=limit,
                params=params,
                verbose=verbose,
            )
            newitems = database.insert(items)
            yield newitems
        except Exception:
            traceback.print_exc()
            print('Retrying in 5...')
            time.sleep(5)


def _livestream_helper(
        submission_function=None,
        comment_function=None,
        verbose=False,
        *args,
        **kwargs,
    ):
    '''
    Given a submission-retrieving function and/or a comment-retrieving function,
    collect submissions and comments in a list together and return that.

    args and kwargs go into the collecting functions.
    '''
    if bool(submission_function) is bool(comment_function) is False:
        raise Exception('Require submissions and/or comments parameter')
    results = []

    if submission_function:
        if verbose:
            print('Getting submissions', args, kwargs)
        this_kwargs = copy.deepcopy(kwargs)
        submission_batch = submission_function(*args, **this_kwargs)
        results.extend(submission_batch)
    if comment_function:
        if verbose:
            print('Getting comments', args, kwargs)
        this_kwargs = copy.deepcopy(kwargs)
        comment_batch = comment_function(*args, **this_kwargs)
        results.extend(comment_batch)
    if verbose:
        print('Collected. Saving...')
    return results

def livestream_argparse(args):
    if args.submissions is args.comments is False:
        args.submissions = True
        args.comments = True
    if args.limit is None:
        limit = 100
    else:
        limit = int(args.limit)

    if args.submissions is False and args.comments is False:
        args.submissions = True
        args.comments = True

    return livestream(
        subreddit=args.subreddit,
        username=args.username,
        do_comments=args.comments,
        do_submissions=args.submissions,
        limit=limit,
        verbose=args.verbose,
        only_once=args.once,
        sleepy=common.int_none(args.sleepy),
    )

@ -0,0 +1,35 @@
import os
import requests

from . import common
from . import tsdb


MIGRATE_QUERY = '''
    INSERT INTO {tablename}
    SELECT othertable.* FROM other.{tablename} othertable
    LEFT JOIN {tablename} mytable ON mytable.idint == othertable.idint
    WHERE mytable.idint IS NULL;
'''
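
# The LEFT JOIN / IS NULL pair above is an anti-join: a row from the attached
# "other" database is copied only when no row with the same idint already
# exists in the destination table, so existing posts are left untouched.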

def _migrate_helper(db, tablename):
    oldcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0]

    query = MIGRATE_QUERY.format(tablename=tablename)
    print(query)
    db.cur.execute(query)
    db.sql.commit()

    newcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0]
    print('Gained %d items.' % (newcount - oldcount))

def mergedb(from_db_path, to_db_path):
    to_db = tsdb.TSDB(to_db_path)
    from_db = tsdb.TSDB(from_db_path)

    to_db.cur.execute('ATTACH DATABASE "%s" AS other' % from_db_path)
    _migrate_helper(to_db, 'submissions')
    _migrate_helper(to_db, 'comments')

def mergedb_argparse(args):
    return mergedb(args.from_db_path, args.to_db_path)

@ -0,0 +1,340 @@
import os
import markdown

from . import common
from . import tsdb


class DBEntry:
    def __init__(self, fetch):
        if fetch[1].startswith('t3_'):
            columns = tsdb.SQL_SUBMISSION_COLUMNS
            self.object_type = 'submission'
        else:
            columns = tsdb.SQL_COMMENT_COLUMNS
            self.object_type = 'comment'

        self.id = None
        self.idstr = None
        for (index, attribute) in enumerate(columns):
            setattr(self, attribute, fetch[index])

    def __repr__(self):
        return 'DBEntry(\'%s\')' % self.id