Add experimental RSS-based refresh_all_channels.
This commit is contained in:
		
							parent
							
								
									6bd4997f3f
								
							
						
					
					
						commit
						8eaae26e55
					
				
					 3 changed files with 85 additions and 0 deletions
				
			
		|  | @ -1,3 +1,4 @@ | ||||||
|  | bs4 | ||||||
| flask | flask | ||||||
| gevent | gevent | ||||||
| https://github.com/voussoir/else/raw/master/_voussoirkit/voussoirkit.zip | https://github.com/voussoir/else/raw/master/_voussoirkit/voussoirkit.zip | ||||||
|  |  | ||||||
|  | @ -8,6 +8,7 @@ from . import constants | ||||||
| from . import exceptions | from . import exceptions | ||||||
| from . import objects | from . import objects | ||||||
| from . import ytapi | from . import ytapi | ||||||
|  | from . import ytrss | ||||||
| 
 | 
 | ||||||
| from voussoirkit import cacheclass | from voussoirkit import cacheclass | ||||||
| from voussoirkit import configlayers | from voussoirkit import configlayers | ||||||
|  | @ -147,7 +148,74 @@ class YCDLDBChannelMixin: | ||||||
|         channels.sort(key=lambda c: c.name.lower()) |         channels.sort(key=lambda c: c.name.lower()) | ||||||
|         return channels |         return channels | ||||||
| 
 | 
 | ||||||
|  |     def _rss_assisted_refresh(self, skip_failures=False, commit=True): | ||||||
|  |         ''' | ||||||
|  |         Youtube provides RSS feeds for every channel. These feeds do not | ||||||
|  |         require the API token and seem to have generous ratelimits, or | ||||||
|  |         perhaps no ratelimits at all. | ||||||
|  | 
 | ||||||
|  |         This RSS-assisted refresh will cut down on tokened API calls by: | ||||||
|  |         1. Getting video IDs from the free RSS feed instead of the tokened | ||||||
|  |            playlistItems endpoint. | ||||||
|  |         2. Batching video IDs from multiple channels and requesting them | ||||||
|  |            together, instead of requesting each channel's videos separately in | ||||||
|  |            less-than-full batches. In retrospect, this improvement could be | ||||||
|  |            applied to the non-RSS refresh method too. If this RSS experiment | ||||||
|  |            turns out to be bad, I can at least go ahead with that. | ||||||
|  | 
 | ||||||
|  |         The RSS has two limitations: | ||||||
|  |         1. It does not contain all the properties I want to store, otherwise | ||||||
|  |            I'd happily use that data directly instead of passing the ID batches | ||||||
|  |            into ytapi. | ||||||
|  |         2. It only returns the latest 15 videos, and of course does not | ||||||
|  |            paginate. So, for any channel with more than 14 new videos, we'll | ||||||
|  |            do a traditional refresh. | ||||||
|  |         ''' | ||||||
|  |         query = 'SELECT id FROM videos WHERE author_id == ? ORDER BY published DESC LIMIT 1' | ||||||
|  |         exceptions = [] | ||||||
|  | 
 | ||||||
|  |         def traditional(channel): | ||||||
|  |             try: | ||||||
|  |                 channel.refresh() | ||||||
|  |             except Exception as exc: | ||||||
|  |                 if skip_failures: | ||||||
|  |                     traceback.print_exc() | ||||||
|  |                     exceptions.append(exc) | ||||||
|  |                 else: | ||||||
|  |                     raise | ||||||
|  | 
 | ||||||
|  |         def gen(): | ||||||
|  |             for channel in self.get_channels(): | ||||||
|  |                 most_recent_video = self.sql_select_one(query, [channel.id])[0] | ||||||
|  |                 try: | ||||||
|  |                     rss_videos = ytrss.get_user_videos(channel.id) | ||||||
|  |                 except Exception: | ||||||
|  |                     # traceback.print_exc() | ||||||
|  |                     traditional(channel) | ||||||
|  |                     continue | ||||||
|  | 
 | ||||||
|  |                 try: | ||||||
|  |                     index = rss_videos.index(most_recent_video) | ||||||
|  |                 except ValueError: | ||||||
|  |                     # traceback.print_exc() | ||||||
|  |                     traditional(channel) | ||||||
|  |                     continue | ||||||
|  | 
 | ||||||
|  |                 new_ids = rss_videos[:index] | ||||||
|  |                 yield from new_ids | ||||||
|  | 
 | ||||||
|  |         for video in self.youtube.get_videos(gen()): | ||||||
|  |             self.ingest_video(video, commit=False) | ||||||
|  | 
 | ||||||
|  |         if commit: | ||||||
|  |             self.commit() | ||||||
|  | 
 | ||||||
|  |         return exceptions | ||||||
|  | 
 | ||||||
|     def refresh_all_channels(self, force=False, skip_failures=False, commit=True): |     def refresh_all_channels(self, force=False, skip_failures=False, commit=True): | ||||||
|  |         if not force: | ||||||
|  |             return self._rss_assisted_refresh(skip_failures=skip_failures, commit=commit) | ||||||
|  | 
 | ||||||
|         exceptions = [] |         exceptions = [] | ||||||
|         for channel in self.get_channels(): |         for channel in self.get_channels(): | ||||||
|             try: |             try: | ||||||
|  | @ -160,6 +228,7 @@ class YCDLDBChannelMixin: | ||||||
|                     raise |                     raise | ||||||
|         if commit: |         if commit: | ||||||
|             self.commit() |             self.commit() | ||||||
|  | 
 | ||||||
|         return exceptions |         return exceptions | ||||||
| 
 | 
 | ||||||
| class YCDLSQLMixin: | class YCDLSQLMixin: | ||||||
|  |  | ||||||
							
								
								
									
										15
									
								
								ycdl/ytrss.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								ycdl/ytrss.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | ||||||
|  | import bs4 | ||||||
|  | import logging | ||||||
|  | import requests | ||||||
|  | 
 | ||||||
|  | log = logging.getLogger(__name__) | ||||||
|  | 
 | ||||||
def get_user_videos(uid, timeout=30):
    '''
    Fetch the Youtube RSS feed for the channel with ID `uid` and return the
    text of every yt:videoid element, as a list in document order.

    timeout:
        Seconds passed to requests.get as its timeout, so that a stalled
        connection raises instead of blocking forever.

    Raises whatever requests raises for connection problems and timeouts,
    and requests.HTTPError (via raise_for_status) for a 4xx / 5xx response.
    '''
    log.debug('Fetching RSS for %s.', uid)
    url = f'https://www.youtube.com/feeds/videos.xml?channel_id={uid}'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    # find_all does not work on namespaced tags unless you add a limit parameter.
    video_ids = [v.text for v in soup.find_all('yt:videoid', limit=9999)]
    return video_ids
		Loading…
	
		Reference in a new issue