Add experimental RSS-based refresh_all_channels.
parent 6bd4997f3f
commit 8eaae26e55
3 changed files with 85 additions and 0 deletions
@@ -1,3 +1,4 @@
bs4
flask
gevent
https://github.com/voussoir/else/raw/master/_voussoirkit/voussoirkit.zip
@@ -8,6 +8,7 @@ from . import constants
from . import exceptions
from . import objects
from . import ytapi
from . import ytrss

from voussoirkit import cacheclass
from voussoirkit import configlayers
@@ -147,7 +148,74 @@ class YCDLDBChannelMixin:
        channels.sort(key=lambda c: c.name.lower())
        return channels

    def _rss_assisted_refresh(self, skip_failures=False, commit=True):
        '''
        Youtube provides RSS feeds for every channel. These feeds do not
        require the API token and seem to have generous ratelimits, or
        perhaps no ratelimits at all.

        This RSS-assisted refresh will cut down on tokened API calls by:
        1. Getting video IDs from the free RSS feed instead of the tokened
           playlistItems endpoint.
        2. Batching video IDs from multiple channels and requesting them
           together, instead of requesting each channel's videos separately in
           less-than-full batches. In retrospect, this improvement could be
           applied to the non-RSS refresh method too. If this RSS experiment
           turns out to be bad, I can at least go ahead with that.

        The RSS has two limitations:
        1. It does not contain all the properties I want to store, otherwise
           I'd happily use that data directly instead of passing the ID batches
           into ytapi.
        2. It only returns the latest 15 videos, and of course does not
           paginate. So, for any channel with more than 14 new videos, we'll
           do a traditional refresh.
        '''
        query = 'SELECT id FROM videos WHERE author_id == ? ORDER BY published DESC LIMIT 1'
        exceptions = []

        def traditional(channel):
            try:
                channel.refresh()
            except Exception as exc:
                if skip_failures:
                    traceback.print_exc()
                    exceptions.append(exc)
                else:
                    raise

        def gen():
            for channel in self.get_channels():
                most_recent_video = self.sql_select_one(query, [channel.id])[0]
                try:
                    rss_videos = ytrss.get_user_videos(channel.id)
                except Exception:
                    # traceback.print_exc()
                    traditional(channel)
                    continue

                try:
                    index = rss_videos.index(most_recent_video)
                except ValueError:
                    # traceback.print_exc()
                    traditional(channel)
                    continue

                new_ids = rss_videos[:index]
                yield from new_ids

        for video in self.youtube.get_videos(gen()):
            self.ingest_video(video, commit=False)

        if commit:
            self.commit()

        return exceptions

    def refresh_all_channels(self, force=False, skip_failures=False, commit=True):
        if not force:
            return self._rss_assisted_refresh(skip_failures=skip_failures, commit=commit)

        exceptions = []
        for channel in self.get_channels():
            try:
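For illustration only (not part of this commit), here is a standalone sketch of the new-video detection used inside gen() above. The feed lists videos newest-first, so everything before the most recent ID already stored in the database is new; the video IDs here are invented for the example.

    # Hypothetical RSS result, newest first (IDs invented for this sketch).
    rss_videos = ['video5', 'video4', 'video3', 'video2', 'video1']

    # Suppose 'video3' is the newest video already stored in the database.
    most_recent_video = 'video3'

    # index() raises ValueError when the stored video is not in the feed,
    # which is the signal to fall back to a traditional refresh.
    index = rss_videos.index(most_recent_video)

    new_ids = rss_videos[:index]
    print(new_ids)  # ['video5', 'video4']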
@@ -160,6 +228,7 @@ class YCDLDBChannelMixin:
                    raise
        if commit:
            self.commit()

        return exceptions

class YCDLSQLMixin:
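As a usage sketch (assuming a database object, here called ycdldb, already built from these mixins, which this diff does not show), the default path now goes through the RSS-assisted refresh, while force=True keeps the traditional per-channel API refresh:

    # Default: RSS-assisted refresh. Channels whose feed cannot be reconciled
    # with the database fall back to the tokened refresh automatically.
    exceptions = ycdldb.refresh_all_channels(skip_failures=True)

    # force=True bypasses the RSS path and does the traditional refresh
    # for every channel.
    exceptions = ycdldb.refresh_all_channels(force=True, skip_failures=True)

    for exc in exceptions:
        print(exc)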
ycdl/ytrss.py  15  Normal file
@@ -0,0 +1,15 @@
import bs4
import logging
import requests

log = logging.getLogger(__name__)

def get_user_videos(uid):
    log.debug(f'Fetching RSS for {uid}.')
    url = f'https://www.youtube.com/feeds/videos.xml?channel_id={uid}'
    response = requests.get(url)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    # find_all does not work on namespaced tags unless you add a limit parameter.
    video_ids = [v.text for v in soup.find_all('yt:videoid', limit=9999)]
    return video_ids
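A quick usage sketch for the new module (the channel ID below is a placeholder; network access and the bs4, lxml, and requests packages are assumed):

    from ycdl import ytrss

    # Placeholder ID; substitute a real channel ID.
    channel_id = 'UCxxxxxxxxxxxxxxxxxxxxxx'

    # The feed carries at most the channel's 15 most recent uploads, newest first.
    video_ids = ytrss.get_user_videos(channel_id)
    print(len(video_ids), video_ids[:3])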