Add experimental RSS-based refresh_all_channels.

2020-08-12 12:32:01 -07:00 · 2020-08-12 12:32:01 -07:00 · 8eaae26e55
commit 8eaae26e55
parent 6bd4997f3f
3 changed files with 85 additions and 0 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,4 @@
+bs4
 flask
 gevent
 https://github.com/voussoir/else/raw/master/_voussoirkit/voussoirkit.zip
--- a/ycdl/ycdldb.py
+++ b/ycdl/ycdldb.py
@ -8,6 +8,7 @@ from . import constants
 from . import exceptions
 from . import objects
 from . import ytapi
+from . import ytrss

 from voussoirkit import cacheclass
 from voussoirkit import configlayers
@ -147,7 +148,74 @@ class YCDLDBChannelMixin:
        channels.sort(key=lambda c: c.name.lower())
        return channels

+    def _rss_assisted_refresh(self, skip_failures=False, commit=True):
+        '''
+        Refresh every channel, using its RSS feed to discover new video IDs
+        where possible and falling back to channel.refresh() otherwise.
+
+        Youtube provides RSS feeds for every channel. These feeds do not
+        require the API token and seem to have generous ratelimits, or
+        perhaps no ratelimits at all.
+
+        This RSS-assisted refresh will cut down on tokened API calls by:
+        1. Getting video IDs from the free RSS feed instead of the tokened
+           playlistItems endpoint.
+        2. Batching video IDs from multiple channels and requesting them
+           together, instead of requesting each channel's videos separately in
+           less-than-full batches. In retrospect, this improvement could be
+           applied to the non-RSS refresh method too. If this RSS experiment
+           turns out to be bad, I can at least go ahead with that.
+
+        The RSS has two limitations:
+        1. It does not contain all the properties I want to store, otherwise
+           I'd happily use that data directly instead of passing the ID batches
+           into ytapi.
+        2. It only returns the latest 15 videos, and of course does not
+           paginate. So, for any channel with more than 14 new videos, we'll
+           do a traditional refresh.
+
+        A traditional refresh is also used when fetching the RSS feed raises,
+        and when the channel has no stored video to look for in the feed.
+
+        skip_failures:
+            If True, an exception raised by a traditional refresh is printed
+            and collected, and the remaining channels are still processed.
+            If False, that exception is raised.
+
+        commit:
+            If True, commit after the batched videos have been ingested.
+
+        Returns the list of exceptions that were collected instead of raised.
+        '''
+        query = 'SELECT id FROM videos WHERE author_id == ? ORDER BY published DESC LIMIT 1'
+        exceptions = []
+
+        def traditional(channel):
+            try:
+                channel.refresh()
+            except Exception as exc:
+                if skip_failures:
+                    traceback.print_exc()
+                    exceptions.append(exc)
+                else:
+                    raise
+
+        def gen():
+            # A single stream of new video IDs from all channels, so that
+            # get_videos receives them together rather than per channel.
+            for channel in self.get_channels():
+                row = self.sql_select_one(query, [channel.id])
+                if not row:
+                    # The channel has no stored videos (presumably
+                    # sql_select_one gives None when nothing matches), so
+                    # there is no known video to locate in the feed.
+                    traditional(channel)
+                    continue
+                most_recent_video = row[0]
+
+                try:
+                    rss_videos = ytrss.get_user_videos(channel.id)
+                except Exception:
+                    # Best effort: any RSS problem just means we spend the
+                    # tokened API calls on this channel instead.
+                    traditional(channel)
+                    continue
+
+                try:
+                    index = rss_videos.index(most_recent_video)
+                except ValueError:
+                    # The newest stored video is not in the feed, so the feed
+                    # cannot tell us where the new videos end.
+                    traditional(channel)
+                    continue
+
+                # Entries ahead of the newest stored video are treated as new
+                # (this assumes the feed is ordered newest first).
+                yield from rss_videos[:index]
+
+        for video in self.youtube.get_videos(gen()):
+            self.ingest_video(video, commit=False)
+
+        if commit:
+            self.commit()
+
+        return exceptions
+
    def refresh_all_channels(self, force=False, skip_failures=False, commit=True):
+        if not force:
+            return self._rss_assisted_refresh(skip_failures=skip_failures, commit=commit)
+
        exceptions = []
        for channel in self.get_channels():
            try:
@ -160,6 +228,7 @@ class YCDLDBChannelMixin:
                    raise
        if commit:
            self.commit()
+
        return exceptions

 class YCDLSQLMixin:
--- a/ycdl/ytrss.py
+++ b/ycdl/ytrss.py
@ -0,0 +1,15 @@
+import bs4
+import logging
+import requests
+
+log = logging.getLogger(__name__)
+
+def get_user_videos(uid, timeout=10):
+    '''
+    Fetch the RSS feed of the channel with ID `uid` and return the text of
+    each yt:videoid tag found in it, as a list of strings.
+
+    timeout:
+        Number of seconds passed to requests.get as its timeout, so that a
+        stalled connection raises instead of blocking the caller forever.
+
+    Raises requests.HTTPError if the response has an error status. Other
+    requests exceptions (connection failure, timeout) propagate unchanged.
+    '''
+    log.debug('Fetching RSS for %s.', uid)
+    url = f'https://www.youtube.com/feeds/videos.xml?channel_id={uid}'
+    response = requests.get(url, timeout=timeout)
+    response.raise_for_status()
+    soup = bs4.BeautifulSoup(response.text, 'lxml')
+    # find_all does not work on namespaced tags unless you add a limit parameter.
+    video_ids = [v.text for v in soup.find_all('yt:videoid', limit=9999)]
+    return video_ids