Initial migratory commit from voussoir/reddit.
Commit 708c774e52
18 changed files with 2471 additions and 0 deletions
.gitattributes (vendored, Normal file, 30 lines)
@@ -0,0 +1,30 @@
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp
*.sln merge=union
*.csproj merge=union
*.vbproj merge=union
*.fsproj merge=union
*.dbproj merge=union

*.psd binary
*.zip binary
*.db binary
*.png binary
*.jpg binary
*.ico binary
*.exe binary

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
.gitignore (vendored, Normal file, 231 lines)
@@ -0,0 +1,231 @@
databases/*
@hangman.md
hangman.py
merge_database.py
migrate_20160605.py
timesearch_backup.py

*.ignore
*.db-journal
*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath


#################
## Visual Studio
#################

## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.

# User-specific files
*.suo
*.user
*.sln.docstates

# Build results

[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile

# Visual Studio profiler
*.psess
*.vsp
*.vspx

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# NCrunch
*.ncrunch*
.*crunch*.local.xml

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.Publish.xml
*.pubxml

# NuGet Packages Directory
## TODO: If you have NuGet Package Restore enabled, uncomment the next line
#packages/

# Windows Azure Build Output
csx
*.build.csdef

# Windows Store app package directory
AppPackages/

# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
App_Data/*.mdf
App_Data/*.ldf

#############
## Windows detritus
#############

# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Mac crap
.DS_Store


#############
## Python
#############

*.py[co]

# Packages
*.egg
*.egg-info
dist/
build/
eggs/
parts/
var/
sdist/
develop-eggs/
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

#Translations
*.mo

#Mr Developer
.mr.developer.cfg
=======
*~
*.egg
*.pyc
.coverage
*.egg-info/
_build/
build/
dist/
.DS_Store
README.md (Normal file, 114 lines)
@@ -0,0 +1,114 @@
timesearch
==========

I don't have a test suite. You're my test suite! Messages go to [/u/GoldenSights](https://reddit.com/u/GoldenSights).

Timesearch is a collection of utilities for archiving subreddits.

### Make sure you have:
- Installed [Python](https://www.python.org/download). I use Python 3.6.
- Installed PRAW >= 4, as well as the other modules in `requirements.txt`. Try `pip install -r requirements.txt` to get them all.
- Created an OAuth app at https://reddit.com/prefs/apps. Make it `script` type, and set the redirect URI to `http://localhost:8080`. The title and description can be anything you want, and the about URL is not required.
- Used [this PRAW script](https://praw.readthedocs.io/en/latest/tutorials/refresh_token.html) to generate a refresh token. Just save it as a .py file somewhere and run it through your terminal / command line. For simplicity's sake, I just choose `all` for the scopes.
- Downloaded a copy of [this file](https://github.com/voussoir/reddit/blob/master/bot4.py) and saved it as `bot.py`. Fill out the variables using your OAuth information, and read the instructions to see where to put it. The useragent is a description of your API usage; typically "/u/username's praw client" is sufficient. A rough sketch of what the finished file looks like is shown below.
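This is only an assumption of what a filled-out `bot.py` might look like, based on how `timesearch/common.py` calls it (`bot.anonymous()` and `bot.login(r)`). The linked bot4.py is the authoritative template, and every credential value here is a placeholder.

```python
# bot.py -- hypothetical sketch, not the official bot4.py
import praw

USERAGENT = "/u/yourusername's praw client"
CLIENT_ID = 'your_app_client_id'
CLIENT_SECRET = 'your_app_client_secret'
REFRESH_TOKEN = 'your_refresh_token'

def anonymous():
    # Read-only Reddit instance, used before login() is called.
    return praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USERAGENT,
    )

def login(r):
    # Return an authorized instance using the refresh token.
    # The real bot4.py may wire this up differently.
    return praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        refresh_token=REFRESH_TOKEN,
        user_agent=USERAGENT,
    )
```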

### This package consists of:

- **timesearch**: If you try to page through `/new` on a subreddit, you'll hit a limit at or before 1,000 posts. Timesearch uses the `timestamp` cloudsearch query parameter to step from the beginning of a subreddit to the present, collecting as many submissions as possible. Read more about timestamp searching [here](https://www.reddit.com/r/reddittips/comments/2ix73n/use_cloudsearch_to_search_for_posts_on_reddit/).
    `> timesearch.py timesearch -r subredditname <flags>`
    `> timesearch.py timesearch -u username <flags>`

- **commentaugment**: Although we can search for submissions, we cannot search for comments. After performing a timesearch, you can use commentaugment to download the comment tree for each submission.
    Note: commentaugment only gets the comments attached to the submissions that you found in your timesearch scan. If you run commentaugment on a user, you're going to get comments that were made on their submissions, **not** comments they made on other people's submissions. Therefore, comprehensively collecting a user's activity is not possible. You will have to use someone else's dataset, like that of [/u/Stuck_in_the_Matrix](https://reddit.com/u/Stuck_in_the_Matrix) at [pushshift.io](https://pushshift.io).
    `> timesearch.py commentaugment -r subredditname <flags>`
    `> timesearch.py commentaugment -u username <flags>`

- **livestream**: timesearch+commentaugment is great for starting your database and getting historical posts, but it's not the best for staying up-to-date. Instead, livestream monitors `/new` and `/comments` to continuously ingest data.
    `> timesearch.py livestream -r subredditname <flags>`
    `> timesearch.py livestream -u username <flags>`

- **getstyles**: Downloads the stylesheet and CSS images.
    `> timesearch.py getstyles -r subredditname`

- **getwiki**: Downloads the wiki pages, sidebar, etc. from /wiki/pages.
    `> timesearch.py getwiki -r subredditname`

- **offline_reading**: Renders comment threads into HTML via markdown.
    Note: I'm currently using the [markdown library from pypi](https://pypi.python.org/pypi/Markdown), and it doesn't do reddit's custom markdown like `/r/` or `/u/`, obviously. So far I don't think anybody really uses offline_reading, so I haven't invested much time into improving it.
    `> timesearch.py offline_reading -r subredditname <flags>`
    `> timesearch.py offline_reading -u username <flags>`

- **redmash**: Generates plaintext or HTML lists of submissions, sorted by a property of your choosing. You can order by date, author, flair, etc.
    `> timesearch.py redmash -r subredditname <flags>`
    `> timesearch.py redmash -u username <flags>`

- **breakdown**: Produces a JSON file indicating which users make the most posts in a subreddit, or which subreddits a user posts in.
    `> timesearch.py breakdown -r subredditname <flags>`
    `> timesearch.py breakdown -u username <flags>`

- **mergedb**: Copies all new data from one timesearch database into another. Useful for syncing or merging two scans of the same subreddit.
    `> timesearch.py mergedb --from filepath/database1.db --to filepath/database2.db`

### To use it

You will need both the `timesearch` package (folder) and the external `timesearch.py` file. You can click the green "Clone or Download" button in the upper right. When you run the .py file, it sends your command-line arguments into the package. You can view a summarized version of all the help text with just `timesearch.py`, or view a specific docstring with `timesearch.py livestream`, etc.

I recommend [sqlitebrowser](https://github.com/sqlitebrowser/sqlitebrowser/releases) if you want to inspect the database yourself.

### Changelog
- 2017 11 13
    - Gave timesearch its own GitHub repository so that (1) it will be easier for people to download and (2) it has a cleaner, more independent URL. [voussoir/timesearch](https://github.com/voussoir/timesearch)

- 2017 11 05
    - Added a try-except inside the livestream helper to prevent the generator from terminating.

- 2017 11 04
    - For timesearch, switched from using my custom cloudsearch iterator to the one that comes with PRAW4+.

- 2017 10 12
    - Added the `mergedb` utility for combining databases.

- 2017 06 02
    - You can use `commentaugment -s abcdef` to get a particular thread even if you haven't scraped anything else from that subreddit. Previously `-s` only worked if the database already existed and you specified it via `-r`. Now it is inferred from the submission itself.

- 2017 04 28
    - Complete restructure into a package, started using PRAW4.

- 2016 08 10
    - Started merging redmash and wrote its argparser.

- 2016 07 03
    - Improved docstring clarity.

- 2016 07 02
    - Added `livestream` argparse.

- 2016 06 07
    - Offline_reading has been merged with the main timesearch file.
    - `get_all_posts` renamed to `timesearch`.
    - Timesearch parameter `usermode` renamed to `username`; `maxupper` renamed to `upper`.
    - Everything is now accessible via command-line arguments. Read the docstring at the top of the file.

- 2016 06 05
    - NEW DATABASE SCHEME. Submissions and comments now live in different tables like they should have all along. The submission table has two new columns for a little bit of commentaugment metadata. This allows commentaugment to scan only threads that are new.
    - You can use the `migrate_20160605.py` script to convert old databases into new ones.

- 2015 11 11
    - Created `offline_reading.py`, which converts a timesearch database into a comment tree that can be rendered into HTML.

- 2015 09 07
    - Fixed a bug which allowed `livestream` to crash because `bot.refresh()` was outside of the try-catch.

- 2015 08 19
    - Fixed a bug in which updatescores stopped iterating early if you had more than 100 comments in a row in the db.
    - commentaugment has been completely merged into the timesearch.py file. You can use commentaugment_prompt() to input the parameters, or use the commentaugment() function directly.

____

I want to live in a future where everyone uses UTC and agrees on daylight savings.

<p align="center">
  <img src="https://github.com/voussoir/reddit/blob/master/.GitImages/timesearch_logo_256.png?raw=true" alt="Timesearch"/>
</p>
requirements.txt (Normal file, 3 lines)
@@ -0,0 +1,3 @@
markdown
praw
voussoirkit
timesearch.py (Normal file, 5 lines)
@@ -0,0 +1,5 @@
import sys
import timesearch

status_code = timesearch.main(sys.argv[1:])
raise SystemExit(status_code)
timesearch/__init__.py (Normal file, 436 lines)
@@ -0,0 +1,436 @@
import argparse
import sys

from . import exceptions

# NOTE: Originally I wanted the docstring for each module to be within their
# file. However, this means that composing the global helptext would require
# importing those modules, which will subsequently import PRAW and a whole lot
# of other things. This made TS very slow to load which is okay when you're
# actually using it but really terrible when you're just viewing the help text.
DOCSTRING = '''
Timesearch
The subreddit archiver

The basics:
1. Collect a subreddit's submissions
    > timesearch.py timesearch -r subredditname

2. Collect the comments for those submissions
    > timesearch.py commentaugment -r subredditname

3. Stay up-to-date
    > timesearch.py livestream -r subredditname


Commands for collecting:
{timesearch}
{commentaugment}
{livestream}
{getstyles}
{getwiki}

Commands for processing:
{offline_reading}
{redmash}
{breakdown}
{mergedb}

TO SEE DETAILS ON EACH COMMAND, RUN
> timesearch.py <command>
'''

MODULE_DOCSTRINGS = {
    'breakdown': '''
breakdown:
    Give the comment / submission counts for users in a subreddit, or
    the subreddits that a user posts to.

    Automatically dumps into a <database>_breakdown.json file
    in the same directory as the database.

    > timesearch.py breakdown -r subredditname
    > timesearch.py breakdown -u username

    flags:
    -r "test" | --subreddit "test":
        The subreddit database to break down.

    -u "test" | --username "test":
        The username database to break down.

    --sort "name" | "submissions" | "comments" | "total_posts":
        Sort the output.
    ''',

    'commentaugment': '''
commentaugment:
    Collect comments for the submissions in the database.
    NOTE - if you did a timesearch scan on a username, this function is
    mostly useless. It collects comments that were made on OP's submissions
    but it does not find OP's comments on other people's submissions which
    is what you probably wanted. Unfortunately that's not possible.

    > timesearch.py commentaugment -r subredditname <flags>
    > timesearch.py commentaugment -u username <flags>

    flags:
    -l 18 | --limit 18:
        The number of MoreComments objects to replace.
        Default: No limit

    -t 5 | --threshold 5:
        The number of comments a MoreComments object must claim to have
        for us to open it.
        Actual number received may be lower.
        Default: >= 0

    -n 4 | --num_thresh 4:
        The number of comments a submission must claim to have for us to
        scan it at all.
        Actual number received may be lower.
        Default: >= 1

    -s "t3_xxxxxx" | --specific "t3_xxxxxx":
        Given a submission ID, t3_xxxxxx, scan only that submission.

    -v | --verbose:
        If provided, print more stuff while working.
    ''',

    'getstyles': '''
getstyles:
    Collect the stylesheet, and css images.

    > timesearch.py getstyles -r subredditname
    ''',

    'getwiki': '''
getwiki:
    Collect all available wiki pages.

    > timesearch.py getwiki -r subredditname
    ''',

    'mergedb': '''
mergedb:
    Copy all new posts from one timesearch database into another.

    > timesearch mergedb --from redditdev1.db --to redditdev2.db

    flags:
    --from:
        The database file containing the posts you wish to copy.

    --to:
        The database file to which you will copy the posts.
        The database is modified in-place.
        Existing posts will be ignored and not updated.
    ''',

    'livestream': '''
livestream:
    Continuously collect submissions and/or comments.

    > timesearch.py livestream -r subredditname <flags>
    > timesearch.py livestream -u username <flags>

    flags:
    -r "test" | --subreddit "test":
        The subreddit to collect from.

    -u "test" | --username "test":
        The redditor to collect from.

    -s | --submissions:
        If provided, do collect submissions. Otherwise don't.

    -c | --comments:
        If provided, do collect comments. Otherwise don't.

    If submissions and comments are BOTH left unspecified, then they will
    BOTH be collected.

    -v | --verbose:
        If provided, print extra information to the screen.

    -w 30 | --wait 30:
        The number of seconds to wait between cycles.

    -1 | --once:
        If provided, only do a single loop. Otherwise go forever.
    ''',

    'offline_reading': '''
offline_reading:
    Render submissions and comment threads to HTML via Markdown.

    > timesearch.py offline_reading -r subredditname <flags>
    > timesearch.py offline_reading -u username <flags>

    flags:
    -s "t3_xxxxxx" | --specific "t3_xxxxxx":
        Given a submission ID, t3_xxxxxx, render only that submission.
        Otherwise render every submission in the database.
    ''',

    'redmash': '''
redmash:
    Dump submission listings to a plaintext or HTML file.

    > timesearch.py redmash -r subredditname <flags>
    > timesearch.py redmash -u username <flags>

    flags:
    -r "test" | --subreddit "test":
        The subreddit database to dump.

    -u "test" | --username "test":
        The username database to dump.

    --html:
        Write HTML files instead of plain text.

    -st 50 | --score_threshold 50:
        Only mash posts with at least this many points.
        Applies to ALL mashes!

    --all:
        Perform all of the mashes listed below.

    --date:
        Perform a mash sorted by date.

    --title:
        Perform a mash sorted by title.

    --score:
        Perform a mash sorted by score.

    --author:
        For subreddit databases only.
        Perform a mash sorted by author.

    --sub:
        For username databases only.
        Perform a mash sorted by subreddit.

    --flair:
        Perform a mash sorted by flair.

    examples:
    `timesearch redmash -r botwatch --date`
        does only the date file.

    `timesearch redmash -r botwatch --score --title`
        does both the score and title files.

    `timesearch redmash -r botwatch --score --score_threshold 50`
        only shows submissions with >= 50 points.

    `timesearch redmash -r botwatch --all`
        performs all of the different mashes.
    ''',

    'timesearch': '''
timesearch:
    Collect submissions from the subreddit across all of history, or
    Collect submissions by a user (as many as possible).

    > timesearch.py timesearch -r subredditname <flags>
    > timesearch.py timesearch -u username <flags>

    -r "test" | --subreddit "test":
        The subreddit to scan. Mutually exclusive with username.

    -u "test" | --username "test":
        The user to scan. Mutually exclusive with subreddit.

    -l "update" | --lower "update":
        If a number - the unix timestamp to start at.
        If "update" - continue from latest submission in db.
        Default: update

    -up 1467460221 | --upper 1467460221:
        If a number - the unix timestamp to stop at.
        If not provided - stop at current time.
        Default: current time

    -i 86400 | --interval 86400:
        The initial interval for the scanning window, in seconds.
        This is only a starting value. The window will shrink and stretch
        as necessary based on received submission counts.
        Default: 86400
    ''',
}


def docstring_preview(text):
    '''
    Return the brief description at the top of the text.
    User can get full text by looking at each specifically.
    '''
    return text.split('\n\n')[0]

def listget(li, index, fallback=None):
    try:
        return li[index]
    except IndexError:
        return fallback

def indent(text, spaces=4):
    spaces = ' ' * spaces
    return '\n'.join(spaces + line if line.strip() != '' else line for line in text.split('\n'))

docstring_headers = {
    key: indent(docstring_preview(value))
    for (key, value) in MODULE_DOCSTRINGS.items()
}

DOCSTRING = DOCSTRING.format(**docstring_headers)

####################################################################################################
####################################################################################################

def breakdown_gateway(args):
    from . import breakdown
    breakdown.breakdown_argparse(args)

def commentaugment_gateway(args):
    from . import commentaugment
    commentaugment.commentaugment_argparse(args)

def getstyles_gateway(args):
    from . import getstyles
    getstyles.getstyles_argparse(args)

def getwiki_gateway(args):
    from . import getwiki
    getwiki.getwiki_argparse(args)

def livestream_gateway(args):
    from . import livestream
    livestream.livestream_argparse(args)

def mergedb_gateway(args):
    from . import mergedb
    mergedb.mergedb_argparse(args)

def offline_reading_gateway(args):
    from . import offline_reading
    offline_reading.offline_reading_argparse(args)

def redmash_gateway(args):
    from . import redmash
    redmash.redmash_argparse(args)

def timesearch_gateway(args):
    from . import timesearch
    timesearch.timesearch_argparse(args)


parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()

p_breakdown = subparsers.add_parser('breakdown')
p_breakdown.add_argument('--sort', dest='sort', default=None)
p_breakdown.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_breakdown.add_argument('-u', '--user', dest='username', default=None)
p_breakdown.set_defaults(func=breakdown_gateway)

p_commentaugment = subparsers.add_parser('commentaugment')
p_commentaugment.add_argument('-l', '--limit', dest='limit', default=None)
p_commentaugment.add_argument('-n', '--num_thresh', dest='num_thresh', default=1)
p_commentaugment.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_commentaugment.add_argument('-s', '--specific', dest='specific_submission', default=None)
p_commentaugment.add_argument('-t', '--threshold', dest='threshold', default=0)
p_commentaugment.add_argument('-u', '--user', dest='username', default=None)
p_commentaugment.add_argument('-v', '--verbose', dest='verbose', action='store_true')
p_commentaugment.set_defaults(func=commentaugment_gateway)

p_getstyles = subparsers.add_parser('getstyles')
p_getstyles.add_argument('-r', '--subreddit', dest='subreddit')
p_getstyles.set_defaults(func=getstyles_gateway)

p_getwiki = subparsers.add_parser('getwiki')
p_getwiki.add_argument('-r', '--subreddit', dest='subreddit')
p_getwiki.set_defaults(func=getwiki_gateway)

p_livestream = subparsers.add_parser('livestream')
p_livestream.add_argument('-1', '--once', dest='once', action='store_true')
p_livestream.add_argument('-c', '--comments', dest='comments', action='store_true')
p_livestream.add_argument('-l', '--limit', dest='limit', default=None)
p_livestream.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_livestream.add_argument('-s', '--submissions', dest='submissions', action='store_true')
p_livestream.add_argument('-u', '--user', dest='username', default=None)
p_livestream.add_argument('-v', '--verbose', dest='verbose', action='store_true')
p_livestream.add_argument('-w', '--wait', dest='sleepy', default=30)
p_livestream.set_defaults(func=livestream_gateway)

p_mergedb = subparsers.add_parser('mergedb')
p_mergedb.add_argument('--from', dest='from_db_path', required=True)
p_mergedb.add_argument('--to', dest='to_db_path', required=True)
p_mergedb.set_defaults(func=mergedb_gateway)

p_offline_reading = subparsers.add_parser('offline_reading')
p_offline_reading.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_offline_reading.add_argument('-s', '--specific', dest='specific_submission', default=None)
p_offline_reading.add_argument('-u', '--user', dest='username', default=None)
p_offline_reading.set_defaults(func=offline_reading_gateway)

p_redmash = subparsers.add_parser('redmash')
p_redmash.add_argument('--all', dest='do_all', action='store_true')
p_redmash.add_argument('--author', dest='do_author', action='store_true')
p_redmash.add_argument('--date', dest='do_date', action='store_true')
p_redmash.add_argument('--flair', dest='do_flair', action='store_true')
p_redmash.add_argument('--html', dest='html', action='store_true')
p_redmash.add_argument('--score', dest='do_score', action='store_true')
p_redmash.add_argument('--sub', dest='do_subreddit', action='store_true')
p_redmash.add_argument('--title', dest='do_title', action='store_true')
p_redmash.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_redmash.add_argument('-st', '--score_threshold', dest='score_threshold', default=0)
p_redmash.add_argument('-u', '--user', dest='username', default=None)
p_redmash.set_defaults(func=redmash_gateway)

p_timesearch = subparsers.add_parser('timesearch')
p_timesearch.add_argument('-i', '--interval', dest='interval', default=86400)
p_timesearch.add_argument('-l', '--lower', dest='lower', default='update')
p_timesearch.add_argument('-r', '--subreddit', dest='subreddit', default=None)
p_timesearch.add_argument('-u', '--user', dest='username', default=None)
p_timesearch.add_argument('-up', '--upper', dest='upper', default=None)
p_timesearch.set_defaults(func=timesearch_gateway)

def main(argv):
    helpstrings = {'', 'help', '-h', '--help'}

    command = listget(argv, 0, '').lower()

    # The user did not enter a command, or entered something unrecognized.
    if command not in MODULE_DOCSTRINGS:
        print(DOCSTRING)
        if command == '':
            print('You are seeing the default help text because you did not choose a command.')
        elif command not in helpstrings:
            print('You are seeing the default help text because "%s" was not recognized' % command)
        return 1

    # The user entered a command, but no further arguments, or just help.
    argument = listget(argv, 1, '').lower()
    if argument in helpstrings:
        print(MODULE_DOCSTRINGS[command])
        return 1

    args = parser.parse_args(argv)
    try:
        args.func(args)
    except exceptions.DBNotFound as e:
        message = '"%s" is not an existing database.'
        message += '\nHave you used any of the other utilities to collect data?'
        message = message % e.path.absolute_path
        print(message)
        return 1

    return 0

if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))
timesearch/breakdown.py (Normal file, 103 lines)
@@ -0,0 +1,103 @@
import os
import json

from . import common
from . import tsdb


def breakdown_database(subreddit=None, username=None):
    '''
    Given a database, return a json dict breaking down the submission / comment count for
    users (if a subreddit database) or subreddits (if a user database).
    '''
    if (subreddit is None) == (username is None):
        raise Exception('Enter subreddit or username but not both')

    breakdown_results = {}
    def _ingest(names, subkey):
        for name in names:
            breakdown_results.setdefault(name, {})
            breakdown_results[name].setdefault(subkey, 0)
            breakdown_results[name][subkey] += 1

    if subreddit:
        database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
    else:
        database = tsdb.TSDB.for_user(username, do_create=False)
    cur = database.sql.cursor()

    for table in ['submissions', 'comments']:
        if subreddit:
            cur.execute('SELECT author FROM %s' % table)
        elif username:
            cur.execute('SELECT subreddit FROM %s' % table)

        names = (row[0] for row in common.fetchgenerator(cur))
        _ingest(names, table)

    for name in breakdown_results:
        breakdown_results[name].setdefault('submissions', 0)
        breakdown_results[name].setdefault('comments', 0)

    return breakdown_results

def breakdown_argparse(args):
    if args.subreddit:
        database = tsdb.TSDB.for_subreddit(args.subreddit, do_create=False)
    else:
        database = tsdb.TSDB.for_user(args.username, do_create=False)

    breakdown_results = breakdown_database(
        subreddit=args.subreddit,
        username=args.username,
    )

    def sort_name(name):
        return name.lower()
    def sort_submissions(name):
        invert_score = -1 * breakdown_results[name]['submissions']
        return (invert_score, name.lower())
    def sort_comments(name):
        invert_score = -1 * breakdown_results[name]['comments']
        return (invert_score, name.lower())
    def sort_total_posts(name):
        invert_score = breakdown_results[name]['submissions'] + breakdown_results[name]['comments']
        invert_score = -1 * invert_score
        return (invert_score, name.lower())
    breakdown_sorters = {
        'name': sort_name,
        'submissions': sort_submissions,
        'comments': sort_comments,
        'total_posts': sort_total_posts,
    }

    breakdown_names = list(breakdown_results.keys())
    if args.sort is not None:
        try:
            sorter = breakdown_sorters[args.sort.lower()]
        except KeyError:
            message = '{sorter} is not a sorter. Choose from {options}'
            message = message.format(sorter=args.sort, options=list(breakdown_sorters.keys()))
            raise KeyError(message)
        breakdown_names.sort(key=sorter)
        dump = '    "{name}": {{"submissions": {submissions}, "comments": {comments}}}'
        dump = [dump.format(name=name, **breakdown_results[name]) for name in breakdown_names]
        dump = ',\n'.join(dump)
        dump = '{\n' + dump + '\n}\n'
    else:
        dump = json.dumps(breakdown_results)

    if args.sort is None:
        breakdown_basename = '%s_breakdown.json'
    else:
        breakdown_basename = '%%s_breakdown_%s.json' % args.sort

    breakdown_basename = breakdown_basename % database.filepath.replace_extension('').basename
    breakdown_filepath = database.breakdown_dir.with_child(breakdown_basename)
    os.makedirs(breakdown_filepath.parent.absolute_path, exist_ok=True)
    breakdown_file = open(breakdown_filepath.absolute_path, 'w')
    with breakdown_file:
        breakdown_file.write(dump)
        print('Wrote', breakdown_filepath.relative_path)

    return breakdown_results
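Note: the structure returned by breakdown_database() (and dumped into the `<database>_breakdown.json` file) is a flat mapping keyed by author for a subreddit database, or by subreddit for a user database. A hypothetical excerpt with made-up names and counts:

# Illustration only; not part of the committed file.
breakdown_results = {
    'AutoModerator': {'submissions': 2, 'comments': 511},
    'GoldenSights': {'submissions': 40, 'comments': 367},
}
# --sort total_posts orders these by submissions + comments, descending.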
timesearch/commentaugment.py (Normal file, 179 lines)
@@ -0,0 +1,179 @@
import traceback

from . import common
from . import tsdb


def commentaugment(
        subreddit=None,
        username=None,
        limit=0,
        num_thresh=0,
        specific_submission=None,
        threshold=0,
        verbose=0,
    ):
    '''
    Take the IDs of collected submissions, and gather comments from those threads.
    Please see the global DOCSTRING_COMMENTAUGMENT variable.
    '''
    common.bot.login(common.r)
    if specific_submission is not None:
        if not specific_submission.startswith('t3_'):
            specific_submission = 't3_' + specific_submission
        specific_submission_obj = common.r.submission(specific_submission[3:])
        subreddit = specific_submission_obj.subreddit.display_name

    if (subreddit is None) == (username is None):
        raise Exception('Enter subreddit or username but not both')

    if subreddit:
        if specific_submission is None:
            database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
        else:
            database = tsdb.TSDB.for_subreddit(subreddit, do_create=True)
    else:
        database = tsdb.TSDB.for_user(username, do_create=False)
    cur = database.sql.cursor()

    if limit == 0:
        limit = None

    if specific_submission is None:
        query = '''
            SELECT idstr FROM submissions
            WHERE idstr IS NOT NULL
            AND augmented_at IS NULL
            AND num_comments >= ?
            ORDER BY num_comments DESC
        '''
        bindings = [num_thresh]
        cur.execute(query, bindings)
        fetchall = [item[0] for item in cur.fetchall()]
    else:
        # Make sure the object we're augmenting is in the table too!
        database.insert(specific_submission_obj)
        fetchall = [specific_submission]

    totalthreads = len(fetchall)

    if verbose:
        spacer = '\n\t'
    else:
        spacer = ' '

    scannedthreads = 0
    get_submission = common.nofailrequest(get_submission_immediately)
    while len(fetchall) > 0:
        id_batch = fetchall[:100]
        fetchall = fetchall[100:]

        for submission in id_batch:
            submission = get_submission(submission.split('_')[-1])
            message = 'Processing {fullname}{spacer}expecting {num_comments} | '
            message = message.format(
                fullname=submission.fullname,
                spacer=spacer,
                num_comments=submission.num_comments,
            )

            print(message, end='', flush=True)
            if verbose:
                print()

            comments = get_comments_for_thread(submission, limit, threshold, verbose)

            database.insert(comments, commit=False)
            query = '''
                UPDATE submissions
                set augmented_at = ?,
                augmented_count = ?
                WHERE idstr == ?
            '''
            bindings = [common.get_now(), len(comments), submission.fullname]
            cur.execute(query, bindings)
            database.sql.commit()

            scannedthreads += 1
            if verbose:
                print('\t', end='')
            message = 'Found {count} |{spacer}{scannedthreads} / {totalthreads}'
            message = message.format(
                count=len(comments),
                spacer=spacer,
                scannedthreads=scannedthreads,
                totalthreads=totalthreads,
            )
            print(message)

def get_comments_for_thread(submission, limit, threshold, verbose):
    comments = common.nofailrequest(lambda x: x.comments)(submission)
    # PRAW4 flatten is just list().
    comments = manually_replace_comments(comments, limit, threshold, verbose)
    return comments

def get_submission_immediately(submission_id):
    submission = common.r.submission(submission_id)
    # force the lazyloader
    submission.title = submission.title
    return submission

def manually_replace_comments(incomments, limit=None, threshold=0, verbose=False):
    '''
    PRAW's replace_more_comments method cannot continue
    where it left off in the case of an Ow! screen.
    So I'm writing my own function to get each MoreComments item individually

    Furthermore, this function will maximize the number of retrieved comments by
    sorting the MoreComments objects and getting the big chunks before worrying
    about the tail ends.
    '''
    incomments = incomments.list()
    comments = []
    morecomments = []
    while len(incomments) > 0:
        item = incomments.pop()
        if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold:
            morecomments.append(item)
        elif isinstance(item, common.praw.models.Comment):
            comments.append(item)

    while True:
        try:
            if limit is not None and limit <= 0:
                break
            if len(morecomments) == 0:
                break
            morecomments.sort(key=lambda x: x.count)
            mc = morecomments.pop()
            additional = common.nofailrequest(mc.comments)()
            additionals = 0
            if limit is not None:
                limit -= 1
            for item in additional:
                if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold:
                    morecomments.append(item)
                elif isinstance(item, common.praw.models.Comment):
                    comments.append(item)
                    additionals += 1
            if verbose:
                s = '\tGot %d more, %d so far.' % (additionals, len(comments))
                if limit is not None:
                    s += ' Can perform %d more replacements' % limit
                print(s)
        except KeyboardInterrupt:
            raise
        except Exception:
            traceback.print_exc()
    return comments


def commentaugment_argparse(args):
    return commentaugment(
        subreddit=args.subreddit,
        username=args.username,
        limit=common.int_none(args.limit),
        threshold=common.int_none(args.threshold),
        num_thresh=common.int_none(args.num_thresh),
        verbose=args.verbose,
        specific_submission=args.specific_submission,
    )
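Note: the ordering trick inside manually_replace_comments() is that MoreComments stubs are sorted by their claimed count and popped from the end, so the biggest chunks are fetched before the tail ends and a small --limit still captures most of a thread. A standalone illustration of just that ordering, with made-up counts:

# Illustration only; FakeMoreComments stands in for praw.models.MoreComments.
class FakeMoreComments:
    def __init__(self, count):
        self.count = count

morecomments = [FakeMoreComments(3), FakeMoreComments(250), FakeMoreComments(12)]
morecomments.sort(key=lambda x: x.count)
print([mc.count for mc in morecomments])  # [3, 12, 250]
print(morecomments.pop().count)           # 250 -- the largest stub is opened first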
timesearch/common.py (Normal file, 104 lines)
@@ -0,0 +1,104 @@
import datetime
import os
import time
import traceback

try:
    import praw
except ImportError:
    praw = None
if praw is None or praw.__version__.startswith('3.'):
    import praw4
    praw = praw4

try:
    import bot
except ImportError:
    bot = None
if bot is None or bot.praw != praw:
    import bot4
    bot = bot4


r = bot.anonymous()

def assert_file_exists(filepath):
    if not os.path.exists(filepath):
        raise FileNotFoundError(filepath)

def b36(i):
    if isinstance(i, int):
        return base36encode(i)
    return base36decode(i)

def base36decode(number):
    return int(number, 36)

def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
    """Converts an integer to a base36 string."""
    if not isinstance(number, (int)):
        raise TypeError('number must be an integer')
    base36 = ''
    sign = ''
    if number < 0:
        sign = '-'
        number = -number
    if 0 <= number < len(alphabet):
        return sign + alphabet[number]
    while number != 0:
        number, i = divmod(number, len(alphabet))
        base36 = alphabet[i] + base36
    return sign + base36

def fetchgenerator(cursor):
    while True:
        item = cursor.fetchone()
        if item is None:
            break
        yield item

def generator_chunker(generator, chunk_size):
    chunk = []
    for item in generator:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if len(chunk) != 0:
        yield chunk

def get_now(stamp=True):
    now = datetime.datetime.now(datetime.timezone.utc)
    if stamp:
        return int(now.timestamp())
    return now

def human(timestamp):
    x = datetime.datetime.utcfromtimestamp(timestamp)
    x = datetime.datetime.strftime(x, "%b %d %Y %H:%M:%S")
    return x

def int_none(x):
    if x is None:
        return None
    return int(x)

def nofailrequest(function):
    '''
    Creates a function that will retry until it succeeds.
    This function accepts 1 parameter, a function, and returns a modified
    version of that function that will try-catch, sleep, and loop until it
    finally returns.
    '''
    def a(*args, **kwargs):
        while True:
            try:
                result = function(*args, **kwargs)
                return result
            except KeyboardInterrupt:
                raise
            except Exception:
                traceback.print_exc()
                print('Retrying in 2...')
                time.sleep(2)
    return a
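Note: two helpers above carry most of the weight elsewhere in the package. A standalone illustration (not part of the committed file; importing timesearch.common requires a configured bot.py, so the API call is shown only as a comment):

# base36decode/encode round-trip reddit's short ids: the fullname "t3_abc"
# carries the base36 id 'abc'.
assert int('abc', 36) == 13368      # what base36decode('abc') returns
# base36encode(13368) walks divmod(number, 36) back up to the string 'abc'.

# nofailrequest() turns any flaky API call into a retry-forever call, e.g.:
#     get_submission = common.nofailrequest(common.r.submission)
#     submission = get_submission('abc')   # retries every 2 seconds on errors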
timesearch/exceptions.py (Normal file, 3 lines)
@@ -0,0 +1,3 @@
class DBNotFound(FileNotFoundError):
    def __init__(self, path):
        self.path = path
timesearch/getstyles.py (Normal file, 31 lines)
@@ -0,0 +1,31 @@
import os
import requests

from . import common
from . import tsdb


def getstyles(subreddit):
    print('Getting styles for /r/%s' % subreddit)
    subreddit = common.r.subreddit(subreddit)

    styles = subreddit.stylesheet()
    database = tsdb.TSDB.for_subreddit(subreddit.display_name)

    os.makedirs(database.styles_dir.absolute_path, exist_ok=True)

    stylesheet_filepath = database.styles_dir.with_child('stylesheet.css')
    print('Downloading %s' % stylesheet_filepath.relative_path)
    with open(stylesheet_filepath.absolute_path, 'w', encoding='utf-8') as stylesheet:
        stylesheet.write(styles.stylesheet)

    for image in styles.images:
        image_basename = image['name'] + '.' + image['url'].split('.')[-1]
        image_filepath = database.styles_dir.with_child(image_basename)
        print('Downloading %s' % image_filepath.relative_path)
        with open(image_filepath.absolute_path, 'wb') as image_file:
            response = requests.get(image['url'])
            image_file.write(response.content)

def getstyles_argparse(args):
    return getstyles(args.subreddit)
timesearch/getwiki.py (Normal file, 23 lines)
@@ -0,0 +1,23 @@
import os

from . import common
from . import tsdb


def getwiki(subreddit):
    print('Getting wiki pages for /r/%s' % subreddit)
    subreddit = common.r.subreddit(subreddit)
    database = tsdb.TSDB.for_subreddit(subreddit)

    for wikipage in subreddit.wiki:
        if wikipage.name == 'config/stylesheet':
            continue

        wikipage_path = database.wiki_dir.join(wikipage.name).replace_extension('md')
        os.makedirs(wikipage_path.parent.absolute_path, exist_ok=True)
        with open(wikipage_path.absolute_path, 'w', encoding='utf-8') as handle:
            handle.write(wikipage.content_md)
        print('Wrote', wikipage_path.relative_path)

def getwiki_argparse(args):
    return getwiki(args.subreddit)
timesearch/livestream.py (Normal file, 175 lines)
@@ -0,0 +1,175 @@
import copy
import time
import traceback

from . import common
from . import tsdb


def livestream(
        subreddit=None,
        username=None,
        verbose=False,
        as_a_generator=False,
        do_submissions=True,
        do_comments=True,
        limit=100,
        only_once=False,
        sleepy=30,
    ):
    '''
    Continuously get posts from this source
    and insert them into the database

    as_a_generator:
        return a generator where every iteration does a single livestream loop.
        This is good if you want to manage multiple livestreams yourself by
        calling `next` on each of them, instead of getting stuck in here.
    '''
    if bool(subreddit) == bool(username):
        raise Exception('Require either username / subreddit parameter, but not both')
    if bool(do_submissions) is bool(do_comments) is False:
        raise Exception('Require do_submissions and/or do_comments parameter')
    common.bot.login(common.r)

    if subreddit:
        print('Getting subreddit %s' % subreddit)
        database = tsdb.TSDB.for_subreddit(subreddit)
        subreddit = common.r.subreddit(subreddit)
        submissions = subreddit.new if do_submissions else None
        comments = subreddit.comments if do_comments else None
    else:
        print('Getting redditor %s' % username)
        database = tsdb.TSDB.for_user(username)
        user = common.r.redditor(username)
        submissions = user.submissions.new if do_submissions else None
        comments = user.comments.new if do_comments else None

    generator = _livestream_as_a_generator(
        database,
        submission_function=submissions,
        comment_function=comments,
        limit=limit,
        params={'show': 'all'},
        verbose=verbose,
    )
    if as_a_generator:
        return generator

    while True:
        try:
            step = next(generator)
            newtext = '%ds, %dc' % (step['new_submissions'], step['new_comments'])
            totalnew = step['new_submissions'] + step['new_comments']
            status = '{now} +{new}'.format(now=common.human(common.get_now()), new=newtext)
            print(status, end='', flush=True)
            if totalnew == 0 and verbose is False:
                # Since there was nothing new, allow the next line to overwrite the status
                print('\r', end='')
            else:
                print()

            if verbose:
                print('Loop finished.')
            if only_once:
                break
            time.sleep(sleepy)

        except KeyboardInterrupt:
            print()
            return

        except Exception:
            traceback.print_exc()
            print('Retrying in 5...')
            time.sleep(5)

hangman = lambda: livestream(
    username='gallowboob',
    do_submissions=True,
    do_comments=True,
    sleepy=60,
)

def _livestream_as_a_generator(
        database,
        submission_function,
        comment_function,
        limit,
        params,
        verbose,
    ):
    while True:
        #common.r.handler.clear_cache()
        try:
            items = _livestream_helper(
                submission_function=submission_function,
                comment_function=comment_function,
                limit=limit,
                params=params,
                verbose=verbose,
            )
            newitems = database.insert(items)
            yield newitems
        except Exception:
            traceback.print_exc()
            print('Retrying in 5...')
            time.sleep(5)


def _livestream_helper(
        submission_function=None,
        comment_function=None,
        verbose=False,
        *args,
        **kwargs,
    ):
    '''
    Given a submission-retrieving function and/or a comment-retrieving function,
    collect submissions and comments in a list together and return that.

    args and kwargs go into the collecting functions.
    '''
    if bool(submission_function) is bool(comment_function) is False:
        raise Exception('Require submissions and/or comments parameter')
    results = []

    if submission_function:
        if verbose:
            print('Getting submissions', args, kwargs)
        this_kwargs = copy.deepcopy(kwargs)
        submission_batch = submission_function(*args, **this_kwargs)
        results.extend(submission_batch)
    if comment_function:
        if verbose:
            print('Getting comments', args, kwargs)
        this_kwargs = copy.deepcopy(kwargs)
        comment_batch = comment_function(*args, **this_kwargs)
        results.extend(comment_batch)
    if verbose:
        print('Collected. Saving...')
    return results

def livestream_argparse(args):
    if args.submissions is args.comments is False:
        args.submissions = True
        args.comments = True
    if args.limit is None:
        limit = 100
    else:
        limit = int(args.limit)

    if args.submissions is False and args.comments is False:
        args.submissions = True
        args.comments = True

    return livestream(
        subreddit=args.subreddit,
        username=args.username,
        do_comments=args.comments,
        do_submissions=args.submissions,
        limit=limit,
        verbose=args.verbose,
        only_once=args.once,
        sleepy=common.int_none(args.sleepy),
    )
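Note: the docstring of livestream() mentions driving several streams yourself with `as_a_generator=True`. A minimal sketch of that workflow, assuming a working bot.py and with example subreddit names; each next() call performs one fetch-and-insert cycle for its source:

import time
from timesearch import livestream

streams = [
    livestream.livestream(subreddit='botwatch', as_a_generator=True),
    livestream.livestream(subreddit='redditdev', as_a_generator=True),
]
while True:
    for stream in streams:
        step = next(stream)  # one collect-and-insert cycle for this source
    time.sleep(30)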
timesearch/mergedb.py (Normal file, 35 lines)
@@ -0,0 +1,35 @@
import os
import requests

from . import common
from . import tsdb


MIGRATE_QUERY = '''
INSERT INTO {tablename}
SELECT othertable.* FROM other.{tablename} othertable
LEFT JOIN {tablename} mytable ON mytable.idint == othertable.idint
WHERE mytable.idint IS NULL;
'''

def _migrate_helper(db, tablename):
    oldcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0]

    query = MIGRATE_QUERY.format(tablename=tablename)
    print(query)
    db.cur.execute(query)
    db.sql.commit()

    newcount = db.cur.execute('SELECT count(*) FROM %s' % tablename).fetchone()[0]
    print('Gained %d items.' % (newcount - oldcount))

def mergedb(from_db_path, to_db_path):
    to_db = tsdb.TSDB(to_db_path)
    from_db = tsdb.TSDB(from_db_path)

    to_db.cur.execute('ATTACH DATABASE "%s" AS other' % from_db_path)
    _migrate_helper(to_db, 'submissions')
    _migrate_helper(to_db, 'comments')

def mergedb_argparse(args):
    return mergedb(args.from_db_path, args.to_db_path)
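Note: MIGRATE_QUERY uses a LEFT JOIN anti-join so only rows whose idint is missing from the destination get copied. A standalone demonstration of that pattern with plain sqlite3 (tables and values are made up; the real schema has more columns):

import sqlite3

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE submissions (idint INTEGER PRIMARY KEY)')
db.executemany('INSERT INTO submissions VALUES (?)', [(1,), (2,)])
db.execute('ATTACH DATABASE ":memory:" AS other')
db.execute('CREATE TABLE other.submissions (idint INTEGER PRIMARY KEY)')
db.executemany('INSERT INTO other.submissions VALUES (?)', [(2,), (3,)])
db.execute('''
    INSERT INTO submissions
    SELECT othertable.* FROM other.submissions othertable
    LEFT JOIN submissions mytable ON mytable.idint == othertable.idint
    WHERE mytable.idint IS NULL;
''')
print([row[0] for row in db.execute('SELECT idint FROM submissions ORDER BY idint')])
# [1, 2, 3] -- only the row that was missing (3) got copied; 2 was not duplicated.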
340
timesearch/offline_reading.py
Normal file
340
timesearch/offline_reading.py
Normal file
|
@ -0,0 +1,340 @@
|
|||
import os
|
||||
import markdown
|
||||
|
||||
from . import common
|
||||
from . import tsdb
|
||||
|
||||
|
||||
class DBEntry:
|
||||
def __init__(self, fetch):
|
||||
if fetch[1].startswith('t3_'):
|
||||
columns = tsdb.SQL_SUBMISSION_COLUMNS
|
||||
self.object_type = 'submission'
|
||||
else:
|
||||
columns = tsdb.SQL_COMMENT_COLUMNS
|
||||
self.object_type = 'comment'
|
||||
|
||||
self.id = None
|
||||
self.idstr = None
|
||||
for (index, attribute) in enumerate(columns):
|
||||
setattr(self, attribute, fetch[index])
|
||||
|
||||
def __repr__(self):
|
||||
return 'DBEntry(\'%s\')' % self.id
|
||||
|
||||
|
||||
class TreeNode:
|
||||
def __init__(self, identifier, data, parent=None):
|
||||
assert isinstance(identifier, str)
|
||||
assert '\\' not in identifier
|
||||
self.identifier = identifier
|
||||
self.data = data
|
||||
self.parent = parent
|
||||
self.children = {}
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.children[key]
|
||||
|
||||
def __repr__(self):
|
||||
return 'TreeNode %s' % self.abspath()
|
||||
|
||||
def abspath(self):
|
||||
node = self
|
||||
nodes = [node]
|
||||
while node.parent is not None:
|
||||
node = node.parent
|
||||
nodes.append(node)
|
||||
nodes.reverse()
|
||||
nodes = [node.identifier for node in nodes]
|
||||
return '\\'.join(nodes)
|
||||
|
||||
def add_child(self, other_node, overwrite_parent=False):
|
||||
self.check_child_availability(other_node.identifier)
|
||||
if other_node.parent is not None and not overwrite_parent:
|
||||
raise ValueError('That node already has a parent. Try `overwrite_parent=True`')
|
||||
|
||||
other_node.parent = self
|
||||
self.children[other_node.identifier] = other_node
|
||||
return other_node
|
||||
|
||||
def check_child_availability(self, identifier):
|
||||
if ':' in identifier:
|
||||
raise Exception('Only roots may have a colon')
|
||||
if identifier in self.children:
|
||||
raise Exception('Node %s already has child %s' % (self.identifier, identifier))
|
||||
|
||||
def detach(self):
|
||||
del self.parent.children[self.identifier]
|
||||
self.parent = None
|
||||
|
||||
def listnodes(self, customsort=None):
|
||||
items = list(self.children.items())
|
||||
if customsort is None:
|
||||
items.sort(key=lambda x: x[0].lower())
|
||||
else:
|
||||
items.sort(key=customsort)
|
||||
return [item[1] for item in items]
|
||||
|
||||
def merge_other(self, othertree, otherroot=None):
|
||||
newroot = None
|
||||
if ':' in othertree.identifier:
|
||||
if otherroot is None:
|
||||
raise Exception('Must specify a new name for the other tree\'s root')
|
||||
else:
|
||||
newroot = otherroot
|
||||
else:
|
||||
newroot = othertree.identifier
|
||||
othertree.identifier = newroot
|
||||
othertree.parent = self
|
||||
self.check_child_availability(newroot)
|
||||
self.children[newroot] = othertree
|
||||
|
||||
def printtree(self, customsort=None):
|
||||
for node in self.walk(customsort):
|
||||
print(node.abspath())
|
||||
|
||||
def walk(self, customsort=None):
|
||||
yield self
|
||||
for child in self.listnodes(customsort=customsort):
|
||||
#print(child)
|
||||
#print(child.listnodes())
|
||||
yield from child.walk(customsort=customsort)
|
||||
|
def html_format_comment(comment):
    text = '''
    <div class="comment"
        id="{id}"
        style="
            padding-left: 20px;
            margin-top: 4px;
            margin-right: 4px;
            margin-bottom: 4px;
            border: 2px #000 solid;
        ">
        <p class="userinfo">
            {usernamelink}
            <span class="score"> | {score} points</span>
            <span class="timestamp"> | {human}</span>
        </p>

        <p>{body}</p>

        <p class="toolbar">
            {permalink}
        </p>
        {children}
    </div>
    '''.format(
        id=comment.idstr,
        body=sanitize_braces(render_markdown(comment.body)),
        usernamelink=html_helper_userlink(comment),
        score=comment.score,
        human=common.human(comment.created),
        permalink=html_helper_permalink(comment),
        children='{children}',
    )
    return text

def html_format_submission(submission):
    text = '''
    <div class="submission"
        id="{id}"
        style="
            border: 4px #00f solid;
            padding-left: 20px;
        ">

        <p class="userinfo">
            {usernamelink}
            <span class="score"> | {score} points</span>
            <span class="timestamp"> | {human}</span>
        </p>

        <strong>{title}</strong>
        <p>{url_or_text}</p>

        <p class="toolbar">
            {permalink}
        </p>
    </div>
    {children}
    '''.format(
        id=submission.idstr,
        title=sanitize_braces(submission.title),
        usernamelink=html_helper_userlink(submission),
        score=submission.score,
        human=common.human(submission.created),
        permalink=html_helper_permalink(submission),
        url_or_text=html_helper_urlortext(submission),
        children='{children}',
    )
    return text

def html_from_database(subreddit=None, username=None, specific_submission=None):
    '''
    Given a subreddit or username, open the corresponding timesearch database
    and produce an .html file for each of the submissions it contains
    (or for one particular submission fullname).
    '''
    if markdown is None:
        raise ImportError('Page cannot be rendered without the markdown module')

    if (subreddit is None) == (username is None):
        raise Exception('Enter subreddit or username but not both')

    if subreddit:
        database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
    else:
        database = tsdb.TSDB.for_user(username, do_create=False)

    submission_trees = trees_from_database(database, specific_submission)
    for submission_tree in submission_trees:
        page = html_from_tree(submission_tree, sort=lambda x: x.data.score * -1)
        os.makedirs(database.offline_reading_dir.absolute_path, exist_ok=True)
        html_basename = '%s.html' % submission_tree.identifier
        html_filepath = database.offline_reading_dir.with_child(html_basename)
        html_handle = open(html_filepath.absolute_path, 'w', encoding='utf-8')
        html_handle.write('<html><body><meta charset="UTF-8">')
        html_handle.write(page)
        html_handle.write('</body></html>')
        html_handle.close()
        print('Wrote', html_filepath.relative_path)

def html_from_tree(tree, sort=None):
    '''
    Given a tree *whose root is the submission*, return
    HTML-formatted text representing each submission's comment page.
    '''
    if tree.data.object_type == 'submission':
        page = html_format_submission(tree.data)
    elif tree.data.object_type == 'comment':
        page = html_format_comment(tree.data)
    children = tree.listnodes()
    if sort is not None:
        children.sort(key=sort)
    children = [html_from_tree(child, sort) for child in children]
    if len(children) == 0:
        children = ''
    else:
        children = '\n\n'.join(children)
    try:
        page = page.format(children=children)
    except IndexError:
        print(page)
        raise
    return page

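The brace handling here relies on sanitize_braces (defined below): it doubles any braces in user text, while children='{children}' in the format calls above keeps a literal {children} slot alive through the first .format(), so html_from_tree can fill it with the rendered children in a second pass. A small illustration (not part of the module):

    outer = '<div>{body}{children}</div>'.format(
        body=sanitize_braces('text with {braces}'),
        children='{children}',
    )
    # outer == '<div>text with {{braces}}{children}</div>'
    outer.format(children='<div>child html</div>')
    # '<div>text with {braces}<div>child html</div></div>'
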
def html_helper_permalink(item):
    link = 'https://www.reddit.com/r/%s/comments/' % item.subreddit
    if item.object_type == 'submission':
        link += item.idstr[3:]
    elif item.object_type == 'comment':
        link += '%s/_/%s' % (item.submission[3:], item.idstr[3:])
    link = '<a href="%s">permalink</a>' % link
    return link

def html_helper_urlortext(submission):
    if submission.url:
        text = '<a href="{url}">{url}</a>'.format(url=submission.url)
    elif submission.selftext:
        text = render_markdown(submission.selftext)
    else:
        text = ''
    text = sanitize_braces(text)
    return text

def html_helper_userlink(item):
    name = item.author
    if name.lower() == '[deleted]':
        return '[deleted]'
    link = 'https://www.reddit.com/u/{name}'
    link = '<a href="%s">{name}</a>' % link
    link = link.format(name=name)
    return link

def render_markdown(text):
    text = markdown.markdown(text, output_format='html5')
    return text

def sanitize_braces(text):
    text = text.replace('{', '{{')
    text = text.replace('}', '}}')
    return text

def trees_from_database(database, specific_submission=None):
    '''
    Given a timesearch database, take all of the submission
    ids, take all of the comments for each submission id, and run them
    through `tree_from_submission`.

    Yield each submission's tree as it is generated.
    '''
    cur1 = database.sql.cursor()
    cur2 = database.sql.cursor()

    if specific_submission is None:
        cur1.execute('SELECT idstr FROM submissions ORDER BY created ASC')
        submission_ids = common.fetchgenerator(cur1)
    else:
        specific_submission = 't3_' + specific_submission.split('_')[-1]
        # Insert as a tuple to behave like the sql fetch results
        submission_ids = [(specific_submission, None)]

    found_some_posts = False
    for submission_id in submission_ids:
        # Extract the id from the sql fetch tuple
        submission_id = submission_id[0]
        found_some_posts = True
        cur2.execute('SELECT * FROM submissions WHERE idstr == ?', [submission_id])
        submission = cur2.fetchone()
        cur2.execute('SELECT * FROM comments WHERE submission == ?', [submission_id])
        fetched_comments = cur2.fetchall()
        submission_tree = tree_from_submission(submission, fetched_comments)
        yield submission_tree

    if not found_some_posts:
        raise Exception('Found no submissions!')

def tree_from_submission(submission, commentpool):
    '''
    Given the sqlite data for a submission and all of its comments,
    return a tree with the submission id as the root.
    '''
    submission = DBEntry(submission)
    commentpool = [DBEntry(c) for c in commentpool]
    commentpool.sort(key=lambda x: x.created)

    print('Building tree for %s (%d comments)' % (submission.idstr, len(commentpool)))
    # Thanks Martin Schmidt for the algorithm
    # http://stackoverflow.com/a/29942118/5430534
    tree = TreeNode(identifier=submission.idstr, data=submission)
    node_map = {}

    for comment in commentpool:
        # Ensure this comment is in a node of its own
        this_node = node_map.get(comment.idstr, None)
        if this_node:
            # This ID was detected as a parent in a previous iteration.
            # Now we're actually filling it in.
            this_node.data = comment
        else:
            this_node = TreeNode(comment.idstr, comment)
            node_map[comment.idstr] = this_node

        # Attach this node to the parent.
        if comment.parent.startswith('t3_'):
            tree.add_child(this_node)
        else:
            parent_node = node_map.get(comment.parent, None)
            if not parent_node:
                parent_node = TreeNode(comment.parent, data=None)
                node_map[comment.parent] = parent_node
            parent_node.add_child(this_node)
            this_node.parent = parent_node
    return tree

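Note how the loop above tolerates comments that arrive before their parents: an empty placeholder TreeNode is created for the unseen parent and filled in on a later iteration. A standalone sketch of that pattern with plain dicts (illustration only, made-up ids):

    comments = [('t1_b', 't1_a'), ('t1_a', 't3_root'), ('t1_c', 't1_b')]   # child 't1_b' seen before its parent
    nodes = {}
    children = {}
    for cid, parent in comments:
        nodes.setdefault(cid, {'id': cid})        # ensure the comment has a node
        nodes.setdefault(parent, {'id': parent})  # placeholder for a parent not seen yet
        children.setdefault(parent, []).append(cid)
    # children == {'t1_a': ['t1_b'], 't3_root': ['t1_a'], 't1_b': ['t1_c']}
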
def offline_reading_argparse(args):
    return html_from_database(
        subreddit=args.subreddit,
        username=args.username,
        specific_submission=args.specific_submission,
    )

177
timesearch/redmash.py
Normal file
@ -0,0 +1,177 @@
import datetime
import os

from . import common
from . import tsdb


LINE_FORMAT_TXT = '''
{timestamp}: [{title}]({shortlink}) - /u/{author} (+{score})
'''.replace('\n', '')

LINE_FORMAT_HTML = '''
{timestamp}: <a href=\"{shortlink}\">[{flairtext}] {title}</a> - <a href=\"{authorlink}\">{author}</a> (+{score})<br>
'''.replace('\n', '')

TIMESTAMP_FORMAT = '%Y %b %d'
# The time format.
# "%Y %b %d" = "2016 Aug 10"
# See http://strftime.org/

HTML_HEADER = '''
<html>
<head>
<meta charset="UTF-8">
<style>
    *
    {
        font-family: Consolas;
    }
</style>
</head>

<body>
'''

HTML_FOOTER = '''
</body>
</html>
'''

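For reference, this is roughly what one plain-text line produced from these templates looks like (illustration only; the link id and all values are made up):

    import datetime
    ts = datetime.datetime(2016, 8, 10).strftime(TIMESTAMP_FORMAT)   # '2016 Aug 10'
    LINE_FORMAT_TXT.format(
        timestamp=ts,
        title='Example post',
        shortlink='https://redd.it/xxxxxx',
        author='example_user',
        score=100,
    )
    # '2016 Aug 10: [Example post](https://redd.it/xxxxxx) - /u/example_user (+100)'
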
def redmash(
        subreddit=None,
        username=None,
        do_all=False,
        do_date=False,
        do_title=False,
        do_score=False,
        do_author=False,
        do_subreddit=False,
        do_flair=False,
        html=False,
        score_threshold=0,
    ):
    if (subreddit is None) == (username is None):
        raise Exception('Enter subreddit or username but not both')

    if subreddit:
        database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
    else:
        database = tsdb.TSDB.for_user(username, do_create=False)

    kwargs = {'html': html, 'score_threshold': score_threshold}
    wrote = None

    if do_all or do_date:
        print('Writing time file')
        wrote = redmash_worker(database, suffix='_date', orderby='created ASC', **kwargs)

    if do_all or do_title:
        print('Writing title file')
        wrote = redmash_worker(database, suffix='_title', orderby='title ASC', **kwargs)

    if do_all or do_score:
        print('Writing score file')
        wrote = redmash_worker(database, suffix='_score', orderby='score DESC', **kwargs)

    if not username and (do_all or do_author):
        print('Writing author file')
        wrote = redmash_worker(database, suffix='_author', orderby='author ASC', **kwargs)

    if username and (do_all or do_subreddit):
        print('Writing subreddit file')
        wrote = redmash_worker(database, suffix='_subreddit', orderby='subreddit ASC', **kwargs)

    if do_all or do_flair:
        print('Writing flair file')
        # Items with flair come before items without. Each group is sorted by time separately.
        orderby = 'flair_text IS NULL ASC, created ASC'
        wrote = redmash_worker(database, suffix='_flair', orderby=orderby, **kwargs)

    if not wrote:
        raise Exception('No sorts selected! Read the docstring')
    print('Done.')

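Example call (hypothetical subreddit name), writing a score-sorted HTML mash that keeps only submissions with at least 100 points:

    redmash(subreddit='learnpython', do_score=True, html=True, score_threshold=100)
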
def redmash_worker(
        database,
        suffix,
        orderby,
        score_threshold=0,
        html=False,
    ):
    cur = database.sql.cursor()
    statement = 'SELECT * FROM submissions WHERE score >= {threshold} ORDER BY {order}'
    statement = statement.format(threshold=score_threshold, order=orderby)
    cur.execute(statement)

    os.makedirs(database.redmash_dir.absolute_path, exist_ok=True)

    extension = '.html' if html else '.txt'
    mash_basename = database.filepath.replace_extension('').basename
    mash_basename += suffix + extension
    mash_filepath = database.redmash_dir.with_child(mash_basename)

    mash_handle = open(mash_filepath.absolute_path, 'w', encoding='UTF-8')
    if html:
        mash_handle.write(HTML_HEADER)
        line_format = LINE_FORMAT_HTML
    else:
        line_format = LINE_FORMAT_TXT

    do_timestamp = '{timestamp}' in line_format

    for item in common.fetchgenerator(cur):
        if do_timestamp:
            timestamp = int(item[tsdb.SQL_SUBMISSION['created']])
            timestamp = datetime.datetime.utcfromtimestamp(timestamp)
            timestamp = timestamp.strftime(TIMESTAMP_FORMAT)
        else:
            timestamp = ''

        short_link = 'https://redd.it/%s' % item[tsdb.SQL_SUBMISSION['idstr']][3:]
        author = item[tsdb.SQL_SUBMISSION['author']]
        if author.lower() == '[deleted]':
            author_link = '#'
        else:
            author_link = 'https://reddit.com/u/%s' % author
        line = line_format.format(
            author=author,
            authorlink=author_link,
            flaircss=item[tsdb.SQL_SUBMISSION['flair_css_class']] or '',
            flairtext=item[tsdb.SQL_SUBMISSION['flair_text']] or '',
            id=item[tsdb.SQL_SUBMISSION['idstr']],
            numcomments=item[tsdb.SQL_SUBMISSION['num_comments']],
            score=item[tsdb.SQL_SUBMISSION['score']],
            shortlink=short_link,
            subreddit=item[tsdb.SQL_SUBMISSION['subreddit']],
            timestamp=timestamp,
            title=item[tsdb.SQL_SUBMISSION['title']].replace('\n', ' '),
            url=item[tsdb.SQL_SUBMISSION['url']] or short_link,
        )
        line += '\n'
        mash_handle.write(line)

    if html:
        mash_handle.write(HTML_FOOTER)
    mash_handle.close()
    print('Wrote', mash_filepath.relative_path)
    return mash_filepath

def redmash_argparse(args):
    if args.subreddit is args.username is None:
        raise ValueError('-r subreddit OR -u username must be provided')

    return redmash(
        subreddit=args.subreddit,
        username=args.username,
        do_all=args.do_all,
        do_date=args.do_date,
        do_title=args.do_title,
        do_score=args.do_score,
        do_author=args.do_author,
        do_subreddit=args.do_subreddit,
        do_flair=args.do_flair,
        html=args.html,
        score_threshold=common.int_none(args.score_threshold),
    )

147
timesearch/timesearch.py
Normal file
@ -0,0 +1,147 @@
import time
import traceback

from . import common
from . import tsdb


# The maximum amount by which it can multiply the interval
# when not enough posts are found.
MAXIMUM_EXPANSION_MULTIPLIER = 2


def timesearch(
        subreddit=None,
        username=None,
        lower=None,
        upper=None,
        interval=86400,
    ):
    '''
    Collect submissions across time.
    Please see the global DOCSTRING variable.
    '''
    if (subreddit is None) == (username is None):
        raise Exception('Enter subreddit or username but not both')

    common.bot.login(common.r)

    if subreddit:
        database = tsdb.TSDB.for_subreddit(subreddit)
    else:
        # When searching, we'll take the user's submissions from anywhere.
        subreddit = 'all'
        database = tsdb.TSDB.for_user(username)
    cur = database.sql.cursor()

    if lower == 'update':
        # Start from the latest submission
        cur.execute('SELECT * FROM submissions ORDER BY idint DESC LIMIT 1')
        f = cur.fetchone()
        if f:
            lower = f[tsdb.SQL_SUBMISSION['created']]
            print(f[tsdb.SQL_SUBMISSION['idstr']], common.human(lower), lower)
        else:
            lower = None

    if not isinstance(subreddit, common.praw.models.Subreddit):
        subreddit = common.r.subreddit(subreddit)

    if subreddit != 'all':
        if isinstance(subreddit, common.praw.models.Subreddit):
            creation = subreddit.created_utc
        else:
            subreddits = subreddit.split('+')
            subreddits = [common.r.subreddit(sr) for sr in subreddits]
            creation = min([sr.created_utc for sr in subreddits])
    else:
        if not isinstance(username, common.praw.models.Redditor):
            user = common.r.redditor(username)
        creation = user.created_utc

    if lower is None or lower < creation:
        lower = creation

    maxupper = upper
    if maxupper is None:
        maxupper = common.get_now() + 86400

    form = '{upper} - {lower} +{gain}'
    submissions = subreddit.submissions(start=lower, end=maxupper)
    submissions = common.generator_chunker(submissions, 100)
    for chunk in submissions:
        chunk.sort(key=lambda x: x.created_utc, reverse=True)
        new_count = database.insert(chunk)['new_submissions']
        message = form.format(
            upper=common.human(chunk[0].created_utc),
            lower=common.human(chunk[-1].created_utc),
            gain=new_count,
        )
        print(message)

    #upper = lower + interval
    #toomany_inarow = 0
    # while lower < maxupper:
    #     print('\nCurrent interval:', interval, 'seconds')
    #     print('Lower:', common.human(lower), lower)
    #     print('Upper:', common.human(upper), upper)
    #     if username:
    #         query = '(and author:"%s" (and timestamp:%d..%d))' % (username, lower, upper)
    #     else:
    #         query = 'timestamp:%d..%d' % (lower, upper)

    #     try:
    #         searchresults = subreddit.search(
    #             query,
    #             sort='new',
    #             limit=100,
    #             syntax='cloudsearch'
    #         )
    #         searchresults = list(searchresults)
    #     except Exception:
    #         traceback.print_exc()
    #         print('resuming in 5...')
    #         time.sleep(5)
    #         continue

    #     searchresults.sort(key=lambda x: x.created_utc)
    #     print([i.id for i in searchresults])

    #     itemsfound = len(searchresults)
    #     print('Found', itemsfound, 'items.')
    #     if itemsfound < 50:
    #         print('Too few results, increasing interval', end='')
    #         diff = (1 - (itemsfound / 75)) + 1
    #         diff = min(MAXIMUM_EXPANSION_MULTIPLIER, diff)
    #         interval = int(interval * diff)
    #     if itemsfound > 99:
    #         #Intentionally not elif
    #         print('Too many results, reducing interval', end='')
    #         interval = int(interval * (0.8 - (0.05 * toomany_inarow)))
    #         upper = lower + interval
    #         toomany_inarow += 1
    #     else:
    #         lower = upper
    #         upper = lower + interval
    #         toomany_inarow = max(0, toomany_inarow-1)
    #     print(database.insert(searchresults))
    #     print()

    cur.execute('SELECT COUNT(idint) FROM submissions')
    itemcount = cur.fetchone()[0]

    print('Ended with %d items in %s' % (itemcount, database.filepath.basename))

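common.generator_chunker is defined in common.py (not shown in this hunk); from its use here it apparently groups the submission stream into lists of up to 100 items so they can be inserted in batches. A minimal sketch of a helper with that assumed behavior:

    def generator_chunker(generator, chunk_size):
        # Yield lists of up to chunk_size items from any iterable (assumed behavior).
        chunk = []
        for item in generator:
            chunk.append(item)
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk
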
def timesearch_argparse(args):
    if args.lower == 'update':
        lower = 'update'
    else:
        lower = common.int_none(args.lower)

    return timesearch(
        subreddit=args.subreddit,
        username=args.username,
        lower=lower,
        upper=common.int_none(args.upper),
        interval=common.int_none(args.interval),
    )

335
timesearch/tsdb.py
Normal file
@ -0,0 +1,335 @@
import os
import sqlite3
import types

from . import common
from . import exceptions

from voussoirkit import pathclass


# For backwards compatibility reasons, this list of format strings will help
# timesearch find databases that are using the old filename style.
# The final element will be used if none of the previous ones were found.
DB_FORMATS_SUBREDDIT = [
    '.\\{name}.db',
    '.\\subreddits\\{name}\\{name}.db',
    '.\\{name}\\{name}.db',
    '.\\databases\\{name}.db',
    '.\\subreddits\\{name}\\{name}.db',
]
DB_FORMATS_USER = [
    '.\\@{name}.db',
    '.\\users\\@{name}\\@{name}.db',
    '.\\@{name}\\@{name}.db',
    '.\\databases\\@{name}.db',
    '.\\users\\@{name}\\@{name}.db',
]

DB_INIT = '''
CREATE TABLE IF NOT EXISTS submissions(
    idint INT,
    idstr TEXT,
    created INT,
    self INT,
    nsfw INT,
    author TEXT,
    title TEXT,
    url TEXT,
    selftext TEXT,
    score INT,
    subreddit TEXT,
    distinguish INT,
    textlen INT,
    num_comments INT,
    flair_text TEXT,
    flair_css_class TEXT,
    augmented_at INT,
    augmented_count INT
);
CREATE INDEX IF NOT EXISTS submission_index ON submissions(idstr);
----------------------------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS comments(
    idint INT,
    idstr TEXT,
    created INT,
    author TEXT,
    parent TEXT,
    submission TEXT,
    body TEXT,
    score INT,
    subreddit TEXT,
    distinguish TEXT,
    textlen INT
);
CREATE INDEX IF NOT EXISTS comment_index ON comments(idstr);
'''.strip()

SQL_SUBMISSION_COLUMNS = [
    'idint',
    'idstr',
    'created',
    'self',
    'nsfw',
    'author',
    'title',
    'url',
    'selftext',
    'score',
    'subreddit',
    'distinguish',
    'textlen',
    'num_comments',
    'flair_text',
    'flair_css_class',
    'augmented_at',
    'augmented_count',
]

SQL_COMMENT_COLUMNS = [
    'idint',
    'idstr',
    'created',
    'author',
    'parent',
    'submission',
    'body',
    'score',
    'subreddit',
    'distinguish',
    'textlen',
]

SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)}
SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)}

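SQL_SUBMISSION and SQL_COMMENT map column names to tuple positions, so the rest of the project can read the plain row tuples returned by sqlite by name. A small illustration (hypothetical cursor and row, not part of the module):

    row = cur.execute('SELECT * FROM submissions LIMIT 1').fetchone()
    title = row[SQL_SUBMISSION['title']]
    score = row[SQL_SUBMISSION['score']]
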
class TSDB:
    def __init__(self, filepath, do_create=True):
        self.filepath = pathclass.Path(filepath)
        if not self.filepath.is_file:
            if not do_create:
                raise exceptions.DBNotFound(self.filepath)
            print('New database', self.filepath.relative_path)

        os.makedirs(self.filepath.parent.absolute_path, exist_ok=True)

        self.breakdown_dir = self.filepath.parent.with_child('breakdown')
        self.offline_reading_dir = self.filepath.parent.with_child('offline_reading')
        self.redmash_dir = self.filepath.parent.with_child('redmash')
        self.styles_dir = self.filepath.parent.with_child('styles')
        self.wiki_dir = self.filepath.parent.with_child('wiki')

        self.sql = sqlite3.connect(self.filepath.absolute_path)
        self.cur = self.sql.cursor()
        statements = DB_INIT.split(';')
        for statement in statements:
            self.cur.execute(statement)
        self.sql.commit()

    def __repr__(self):
        return 'TSDB(%s)' % self.filepath

    @staticmethod
    def _pick_filepath(formats, name):
        '''
        Starting with the most specific and preferred filename format, check
        if there is an existing database that matches the name we're looking
        for, and return that path. If none of them exist, then use the most
        preferred filepath.
        '''
        paths = [pathclass.Path(format.format(name=name)) for format in formats]
        for path in paths:
            if path.is_file:
                return path
        return paths[-1]

    @classmethod
    def for_subreddit(cls, name, do_create=True):
        if isinstance(name, common.praw.models.Subreddit):
            name = name.display_name
        elif not isinstance(name, str):
            raise TypeError(name, 'should be str or Subreddit.')

        filepath = cls._pick_filepath(formats=DB_FORMATS_SUBREDDIT, name=name)
        return cls(filepath=filepath, do_create=do_create)

    @classmethod
    def for_user(cls, name, do_create=True):
        if isinstance(name, common.praw.models.Redditor):
            name = name.name
        elif not isinstance(name, str):
            raise TypeError(name, 'should be str or Redditor.')

        filepath = cls._pick_filepath(formats=DB_FORMATS_USER, name=name)
        return cls(filepath=filepath, do_create=do_create)

    def insert(self, objects, commit=True):
        if not isinstance(objects, (list, tuple, types.GeneratorType)):
            objects = [objects]

        new_values = {
            'new_submissions': 0,
            'new_comments': 0,
        }
        methods = {
            common.praw.models.Submission: (self.insert_submission, 'new_submissions'),
            common.praw.models.Comment: (self.insert_comment, 'new_comments'),
        }
        for obj in objects:
            (method, key) = methods.get(type(obj), (None, None))
            if method is None:
                raise TypeError('Unsupported', type(obj), obj)
            status = method(obj)
            new_values[key] += status

        if commit:
            self.sql.commit()

        return new_values

    def insert_submission(self, submission):
        cur = self.sql.cursor()
        cur.execute('SELECT * FROM submissions WHERE idstr == ?', [submission.fullname])
        existing_entry = cur.fetchone()

        if submission.author is None:
            author = '[DELETED]'
        else:
            author = submission.author.name

        if not existing_entry:
            if submission.is_self:
                # Selfpost's URL leads back to itself, so just ignore it.
                url = None
            else:
                url = submission.url

            postdata = {
                'idint': common.b36(submission.id),
                'idstr': submission.fullname,
                'created': submission.created_utc,
                'self': submission.is_self,
                'nsfw': submission.over_18,
                'author': author,
                'title': submission.title,
                'url': url,
                'selftext': submission.selftext,
                'score': submission.score,
                'subreddit': submission.subreddit.display_name,
                'distinguish': submission.distinguished,
                'textlen': len(submission.selftext),
                'num_comments': submission.num_comments,
                'flair_text': submission.link_flair_text,
                'flair_css_class': submission.link_flair_css_class,
                'augmented_at': None,
                'augmented_count': None,
            }
            (qmarks, bindings) = binding_filler(SQL_SUBMISSION_COLUMNS, postdata, require_all=True)
            query = 'INSERT INTO submissions VALUES(%s)' % qmarks
            cur.execute(query, bindings)

        else:
            if submission.author is None:
                # This post is deleted, therefore its text probably says [deleted] or [removed].
                # Discard that, and keep the data we already had here.
                selftext = existing_entry[SQL_SUBMISSION['selftext']]
            else:
                selftext = submission.selftext

            query = '''
                UPDATE submissions SET
                    nsfw = coalesce(?, nsfw),
                    score = coalesce(?, score),
                    selftext = coalesce(?, selftext),
                    distinguish = coalesce(?, distinguish),
                    num_comments = coalesce(?, num_comments),
                    flair_text = coalesce(?, flair_text),
                    flair_css_class = coalesce(?, flair_css_class)
                WHERE idstr == ?
            '''
            bindings = [
                submission.over_18,
                submission.score,
                selftext,
                submission.distinguished,
                submission.num_comments,
                submission.link_flair_text,
                submission.link_flair_css_class,
                submission.fullname
            ]
            cur.execute(query, bindings)

        return existing_entry is None

    def insert_comment(self, comment):
        cur = self.sql.cursor()
        cur.execute('SELECT * FROM comments WHERE idstr == ?', [comment.fullname])
        existing_entry = cur.fetchone()

        if comment.author is None:
            author = '[DELETED]'
        else:
            author = comment.author.name

        if not existing_entry:
            postdata = {
                'idint': common.b36(comment.id),
                'idstr': comment.fullname,
                'created': comment.created_utc,
                'author': author,
                'parent': comment.parent_id,
                'submission': comment.link_id,
                'body': comment.body,
                'score': comment.score,
                'subreddit': comment.subreddit.display_name,
                'distinguish': comment.distinguished,
                'textlen': len(comment.body),
            }
            (qmarks, bindings) = binding_filler(SQL_COMMENT_COLUMNS, postdata, require_all=True)
            query = 'INSERT INTO comments VALUES(%s)' % qmarks
            cur.execute(query, bindings)

        else:
            greasy = ['has been overwritten', 'pastebin.com/64GuVi2F']
            if comment.author is None or any(grease in comment.body for grease in greasy):
                body = existing_entry[SQL_COMMENT['body']]
            else:
                body = comment.body

            query = '''
                UPDATE comments SET
                    score = coalesce(?, score),
                    body = coalesce(?, body),
                    distinguish = coalesce(?, distinguish)
                WHERE idstr == ?
            '''
            bindings = [
                comment.score,
                body,
                comment.distinguished,
                comment.fullname
            ]
            cur.execute(query, bindings)

        return existing_entry is None

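Usage sketch for the class above (hypothetical names; `some_submissions` stands for praw Submission objects fetched elsewhere):

    db = TSDB.for_subreddit('learnpython')
    new_counts = db.insert(some_submissions)
    print(new_counts)   # e.g. {'new_submissions': 42, 'new_comments': 0}
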
def binding_filler(column_names, values, require_all=True):
    '''
    Manually aligning question marks and bindings is annoying.
    Given the table's column names and a dictionary of {column: value},
    return the question marks and the list of bindings in the right order.
    '''
    values = values.copy()
    for column in column_names:
        if column in values:
            continue
        if require_all:
            raise ValueError('Missing column "%s"' % column)
        else:
            values.setdefault(column, None)
    qmarks = '?' * len(column_names)
    qmarks = ', '.join(qmarks)
    bindings = [values[column] for column in column_names]
    return (qmarks, bindings)

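Illustration of binding_filler (not part of the module; column list and values are made up): it keeps the bindings aligned with the column order, so the INSERT statements above stay correct even when the dict is built out of order.

    (qmarks, bindings) = binding_filler(
        ['idint', 'idstr', 'created'],
        {'idstr': 't3_xxxxxx', 'created': 0, 'idint': 123},
    )
    # qmarks   == '?, ?, ?'
    # bindings == [123, 't3_xxxxxx', 0]
    # cur.execute('INSERT INTO submissions VALUES(%s)' % qmarks, bindings)
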