Add experimental RSS-based refresh_all_channels.
This commit is contained in:
parent
6bd4997f3f
commit
8eaae26e55
3 changed files with 85 additions and 0 deletions
|
@ -1,3 +1,4 @@
|
||||||
|
bs4
|
||||||
flask
|
flask
|
||||||
gevent
|
gevent
|
||||||
https://github.com/voussoir/else/raw/master/_voussoirkit/voussoirkit.zip
|
https://github.com/voussoir/else/raw/master/_voussoirkit/voussoirkit.zip
|
||||||
|
|
|
@ -8,6 +8,7 @@ from . import constants
|
||||||
from . import exceptions
|
from . import exceptions
|
||||||
from . import objects
|
from . import objects
|
||||||
from . import ytapi
|
from . import ytapi
|
||||||
|
from . import ytrss
|
||||||
|
|
||||||
from voussoirkit import cacheclass
|
from voussoirkit import cacheclass
|
||||||
from voussoirkit import configlayers
|
from voussoirkit import configlayers
|
||||||
|
@ -147,7 +148,74 @@ class YCDLDBChannelMixin:
|
||||||
channels.sort(key=lambda c: c.name.lower())
|
channels.sort(key=lambda c: c.name.lower())
|
||||||
return channels
|
return channels
|
||||||
|
|
||||||
|
def _rss_assisted_refresh(self, skip_failures=False, commit=True):
    '''
    Youtube provides RSS feeds for every channel. These feeds do not
    require the API token and seem to have generous ratelimits, or
    perhaps no ratelimits at all.

    This RSS-assisted refresh will cut down on tokened API calls by:
    1. Getting video IDs from the free RSS feed instead of the tokened
       playlistItems endpoint.
    2. Batching video IDs from multiple channels and requesting them
       together, instead of requesting each channel's videos separately in
       less-than-full batches. In retrospect, this improvement could be
       applied to the non-RSS refresh method too. If this RSS experiment
       turns out to be bad, I can at least go ahead with that.

    The RSS has two limitations:
    1. It does not contain all the properties I want to store, otherwise
       I'd happily use that data directly instead of passing the ID batches
       into ytapi.
    2. It only returns the latest 15 videos, and of course does not
       paginate. So, for any channel with more than 14 new videos, we'll
       do a traditional refresh.

    skip_failures:
        If True, exceptions raised by a channel's traditional refresh are
        printed and collected instead of propagated, so one bad channel
        does not abort the others.

    Returns the list of exceptions collected under skip_failures.
    '''
    # Most recently published video we already have for a given channel.
    query = 'SELECT id FROM videos WHERE author_id == ? ORDER BY published DESC LIMIT 1'
    exceptions = []

    def traditional(channel):
        # Fall back to the full, tokened API refresh for this channel.
        try:
            channel.refresh()
        except Exception as exc:
            if skip_failures:
                traceback.print_exc()
                exceptions.append(exc)
            else:
                raise

    def gen():
        # Yield the IDs of new videos across all channels, falling back
        # to a traditional refresh whenever RSS can't answer.
        for channel in self.get_channels():
            row = self.sql_select_one(query, [channel.id])
            if row is None:
                # No stored videos for this channel yet, so there is no
                # anchor to diff the RSS feed against. Do a full refresh.
                traditional(channel)
                continue
            most_recent_video = row[0]

            try:
                rss_videos = ytrss.get_user_videos(channel.id)
            except Exception:
                traditional(channel)
                continue

            try:
                index = rss_videos.index(most_recent_video)
            except ValueError:
                # Our newest stored video is not within the feed's ~15
                # entries, so there may be more new videos than the feed
                # shows. Do a full refresh to be safe.
                traditional(channel)
                continue

            # Feed is newest-first; everything before our anchor is new.
            new_ids = rss_videos[:index]
            yield from new_ids

    # get_videos batches the IDs into full-sized tokened API requests.
    for video in self.youtube.get_videos(gen()):
        self.ingest_video(video, commit=False)

    if commit:
        self.commit()

    return exceptions
|
||||||
|
|
||||||
def refresh_all_channels(self, force=False, skip_failures=False, commit=True):
|
def refresh_all_channels(self, force=False, skip_failures=False, commit=True):
|
||||||
|
if not force:
|
||||||
|
return self._rss_assisted_refresh(skip_failures=skip_failures, commit=commit)
|
||||||
|
|
||||||
exceptions = []
|
exceptions = []
|
||||||
for channel in self.get_channels():
|
for channel in self.get_channels():
|
||||||
try:
|
try:
|
||||||
|
@ -160,6 +228,7 @@ class YCDLDBChannelMixin:
|
||||||
raise
|
raise
|
||||||
if commit:
|
if commit:
|
||||||
self.commit()
|
self.commit()
|
||||||
|
|
||||||
return exceptions
|
return exceptions
|
||||||
|
|
||||||
class YCDLSQLMixin:
|
class YCDLSQLMixin:
|
||||||
|
|
15
ycdl/ytrss.py
Normal file
15
ycdl/ytrss.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
import bs4
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def get_user_videos(uid):
    '''
    Return the IDs of the channel's most recent videos, newest first, as
    listed by Youtube's free, tokenless RSS feed. The feed only contains
    the latest ~15 uploads and does not paginate.

    uid:
        The channel ID, e.g. "UCxxxxxxxxxxxxxxxxxxxxxx".

    Raises requests.HTTPError if the feed request fails.
    '''
    log.debug(f'Fetching RSS for {uid}.')
    url = f'https://www.youtube.com/feeds/videos.xml?channel_id={uid}'
    # requests has no default timeout; without one, a hung socket would
    # stall the entire refresh indefinitely.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    # find_all does not work on namespaced tags unless you add a limit parameter.
    video_ids = [v.text for v in soup.find_all('yt:videoid', limit=9999)]
    return video_ids
|
Loading…
Reference in a new issue