import bs4
import datetime
import dateutil.parser
import importlib
import sys

from . import constants

from voussoirkit import cacheclass
from voussoirkit import httperrors
from voussoirkit import pathclass
from voussoirkit import vlogging

log = vlogging.get_logger(__name__)

# Maps url -> {'request_headers': ..., 'etag': ..., 'text': ...} so that
# fetch_xml_cached can revalidate with If-None-Match instead of re-downloading.
_xml_etag_cache = cacheclass.Cache(maxlen=100)

def dateutil_parse(string):
    '''
    Parse a datetime string, using constants.DATEUTIL_TZINFOS to resolve
    timezone abbreviations.
    '''
    return dateutil.parser.parse(string, tzinfos=constants.DATEUTIL_TZINFOS)

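# For example, RSS feeds commonly carry RFC 822 pubDates, which dateutil
# handles directly; the tzinfos table exists to resolve abbreviations that
# dateutil refuses to guess on its own (its exact contents are defined in
# constants and assumed here):
#
#     dateutil_parse('Tue, 10 Jun 2003 04:00:00 GMT')
#     dateutil_parse('Tue, 10 Jun 2003 04:00:00 EST')
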
def fetch_xml(url, headers=None) -> bs4.BeautifulSoup:
    log.debug('Fetching %s.', url)
    response = constants.http_session.get(url, headers=headers)
    httperrors.raise_for_status(response)
    soup = bs4.BeautifulSoup(response.text, 'xml')
    return soup

def fetch_xml_cached(url, headers=None) -> bs4.BeautifulSoup:
    '''
    Fetch the RSS / Atom feed, using a local cache to take advantage of
    HTTP 304 responses.
    '''
    request_headers = headers or {}
    headers = request_headers
    cached = _xml_etag_cache.get(url)
    if cached and cached['request_headers'] == request_headers:
        # Re-send the same request, but with If-None-Match so the server can
        # reply 304 Not Modified instead of the full document.
        headers = request_headers.copy()
        headers['if-none-match'] = cached['etag']

    # To do: use expires / cache-control to avoid making the request at all.
    log.debug('Fetching %s.', url)
    response = constants.http_session.get(url, headers=headers)
    httperrors.raise_for_status(response)

    if cached and response.status_code == 304:
        # Consider: after returning the cached text, it will still go through
        # the rest of the xml parsing and news ingesting steps even though it
        # will almost certainly add nothing new. But I say almost certainly
        # because you could have changed feed settings like isolate_guids.
        # May be room for optimization but it's not worth creating weird edge
        # cases over.
        log.debug('304 Using cached XML for %s.', url)
        response_text = cached['text']
    else:
        response_text = response.text
        if response.headers.get('etag'):
            # Store the caller's original headers, not the ones carrying
            # if-none-match, so the comparison above matches on the next call.
            cached = {
                'request_headers': request_headers,
                'etag': response.headers['etag'],
                'text': response_text,
            }
            _xml_etag_cache[url] = cached

    soup = bs4.BeautifulSoup(response_text, 'xml')
    return soup

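# A sketch of the intended flow, assuming constants.http_session is a
# requests-style Session and the server supports ETags:
#
#     soup = fetch_xml_cached('https://example.com/feed.xml')
#     # First call: full download; etag and body are stored in _xml_etag_cache.
#     soup = fetch_xml_cached('https://example.com/feed.xml')
#     # Second call: sends If-None-Match, and a 304 reply means the cached
#     # body is re-parsed instead of re-downloaded.
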
def import_module_by_path(path):
    '''
    Import a .py file as a module by temporarily pointing sys.path at its
    directory.

    Raises pathclass.NotFile if file does not exist.
    Raises ValueError if basename is not a valid Python identifier.
    '''
    given_path = path
    path = pathclass.Path(path)
    path.assert_is_file()
    name = path.basename.split('.', 1)[0]
    if not name.isidentifier():
        raise ValueError(given_path)
    _syspath = sys.path
    _sysmodules = sys.modules.copy()
    sys.path = [path.parent.absolute_path]
    try:
        module = importlib.import_module(name)
    finally:
        # Restore the import state even if the import raises, so a failed
        # import doesn't leave sys.path clobbered.
        sys.path = _syspath
        sys.modules = _sysmodules
    return module

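# For example, with a hypothetical plugin file at /plugins/myfilter.py, this
# imports it without permanently touching sys.path; and because sys.modules is
# restored afterward, a later call re-imports the file fresh:
#
#     module = import_module_by_path('/plugins/myfilter.py')
#     module.process(...)  # whatever the plugin happens to define
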
def normalize_int_or_none(x):
    if x is None:
        return None

    if isinstance(x, int):
        return x

    if isinstance(x, float):
        return int(x)

    raise TypeError(f'{x} should be int or None, not {type(x)}.')

def normalize_string_blank_to_none(string):
    if string is None:
        return None

    if not isinstance(string, str):
        raise TypeError(string)

    string = string.strip()
    if not string:
        return None

    return string

def normalize_string_strip(string):
    if not isinstance(string, str):
        raise TypeError(string)

    return string.strip()

def normalize_string_not_blank(string):
    if not isinstance(string, str):
        raise TypeError(string)

    string = string.strip()
    if not string:
        raise ValueError(string)

    return string

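# Expected behavior of the normalizers, sketched doctest-style:
#
#     normalize_int_or_none(None)           -> None
#     normalize_int_or_none(4.9)            -> 4 (int() truncates)
#     normalize_string_blank_to_none('  ')  -> None
#     normalize_string_blank_to_none(' a ') -> 'a'
#     normalize_string_strip(' a ')         -> 'a'
#     normalize_string_not_blank('  ')      -> raises ValueError
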
def now(timestamp=True):
    '''
    Return the current UTC time as a unix timestamp, or as a timezone-aware
    datetime object if timestamp is False.
    '''
    n = datetime.datetime.now(datetime.timezone.utc)
    if timestamp:
        return n.timestamp()
    return n

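# Illustrative return values (the timestamp is a float of unix seconds):
#
#     now()                 -> e.g. 1700000000.123456
#     now(timestamp=False)  -> datetime.datetime(..., tzinfo=datetime.timezone.utc)
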
def pick_web_url_atom(entry:bs4.BeautifulSoup):
    '''
    Pick the best human-facing URL from an Atom entry's link elements,
    preferring rel="alternate" with type="text/html", then any
    rel="alternate", then any link at all.
    '''
    best_web_url = entry.find('link', {'rel': 'alternate', 'type': 'text/html'}, recursive=False)
    if best_web_url:
        return best_web_url['href']

    alternate_url = entry.find('link', {'rel': 'alternate'}, recursive=False)
    if alternate_url:
        return alternate_url['href']

    link = entry.find('link', recursive=False)
    if link:
        return link['href']

    return None

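# For instance, in an Atom entry like the following, the text/html alternate
# wins over the enclosure link:
#
#     <entry>
#       <link rel="alternate" type="text/html" href="https://example.com/post/1"/>
#       <link rel="enclosure" type="audio/mpeg" href="https://example.com/ep1.mp3"/>
#     </entry>
#
#     pick_web_url_atom(entry) -> 'https://example.com/post/1'
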
def xml_is_atom(soup:bs4.BeautifulSoup):
    if soup.find('feed'):
        return True

    return False

def xml_is_rss(soup:bs4.BeautifulSoup):
    rss = soup.find('rss')
    if rss and rss.find('channel'):
        return True

    return False

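# Quick sanity check of the two detectors on minimal documents (assuming the
# lxml-backed 'xml' parser is available to bs4):
#
#     atom = bs4.BeautifulSoup('<feed></feed>', 'xml')
#     rss = bs4.BeautifulSoup('<rss><channel></channel></rss>', 'xml')
#     xml_is_atom(atom) -> True  ; xml_is_rss(atom) -> False
#     xml_is_atom(rss)  -> False ; xml_is_rss(rss)  -> True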