2022-04-12 21:44:36 +00:00
|
|
|
import argparse
|
|
|
|
import bs4
|
|
|
|
import collections
|
|
|
|
import requests
|
|
|
|
import sys
|
|
|
|
import urllib.parse
|
|
|
|
|
|
|
|
from voussoirkit import dotdict
|
|
|
|
from voussoirkit import niceprints
|
|
|
|
from voussoirkit import progressbars
|
|
|
|
from voussoirkit import vlogging
|
|
|
|
|
|
|
|
# Module-level logger, registered under the 'linkchecker' name.
log = vlogging.get_logger(__name__, 'linkchecker')
# Silence urllib3's own logger so that only this script's messages are shown.
vlogging.get_logger('urllib3').setLevel(vlogging.SILENT)
|
|
|
|
|
|
|
|
# HTTP status codes that announce a new address through the Location header.
# 308 (Permanent Redirect) is the method-preserving counterpart of 301 and
# must be treated as a redirect just like 307 is.
REDIRECT_CODES = {301, 302, 303, 307, 308}
|
|
|
|
|
|
|
|
# A single shared session, so every request made by this script carries the
# same identifying User-Agent header.
session = requests.Session()
session.headers.update({'User-Agent': 'voussoir.net link health checker'})
|
|
|
|
|
|
|
|
class BrokenAnchor(Exception):
    '''
    Recorded when a URL's #fragment has no element with a matching id in the
    page it points to.
    '''
|
|
|
|
|
|
|
|
def extract_links(url, soup):
    '''
    Return the set of absolute URLs referenced by the page `soup`, which was
    fetched from `url`.

    Collected are the href of every <a> (except same-page '#...' anchors) and
    the src of every <img>, <audio>, and <video>. Each one is resolved against
    `url`, dropped if ignore_link says so, and finally passed through
    normalize_link.
    '''
    hrefs = {
        anchor['href']
        for anchor in soup.select('a[href]')
        if not anchor['href'].startswith('#')
    }
    media_selectors = ('img[src]', 'audio[src]', 'video[src]')
    sources = {
        element['src']
        for selector in media_selectors
        for element in soup.select(selector)
    }

    absolute = {urllib.parse.urljoin(url, raw) for raw in hrefs | sources}
    return {normalize_link(link) for link in absolute if not ignore_link(link)}
|
|
|
|
|
|
|
|
def ignore_link(url):
    '''
    Return True if `url` starts with one of the prefixes that the checker
    deliberately skips.
    '''
    ignored_prefixes = (
        'https://voussoir.net/writing/tags',
        'https://github.com/voussoir/voussoir.net/commit/',
    )
    return url.startswith(ignored_prefixes)
|
|
|
|
|
|
|
|
def normalize_link(url):
    '''
    Return the URL that should actually be requested in order to check `url`.

    Youtube returns HTTP 200 even for invalid video ids at /watch URLs, but
    the thumbnail images return 404. So links to youtube.com (with a `v`
    query parameter) and youtu.be (with a path) are replaced by the URL of
    the video's thumbnail. Every other URL is returned unchanged.
    '''
    parts = urllib.parse.urlparse(url)
    # hostname is lowercased and excludes any port or credentials, unlike
    # netloc. It is None when the URL has no host.
    host = parts.hostname or ''

    # Match the domain itself or a real subdomain of it. A bare endswith
    # would also accept unrelated hosts such as 'notyoutube.com'.
    video_id = ''
    if host == 'youtube.com' or host.endswith('.youtube.com'):
        query = urllib.parse.parse_qs(parts.query, keep_blank_values=False)
        video_id = query.get('v', [''])[0]
    elif host == 'youtu.be' or host.endswith('.youtu.be'):
        video_id = parts.path.strip('/')

    if video_id:
        return f'https://i3.ytimg.com/vi/{video_id}/default.jpg'
    return url
|
|
|
|
|
|
|
|
def linkchecker(do_external=True):
    '''
    Crawl voussoir.net starting at the homepage, check every link found along
    the way, and print a report followed by a good / warning / bad tally.

    do_external:
        If True, links to other domains are requested too (HEAD, retried as
        GET when the server answers 405). If False, they are skipped and do
        not appear in the report or the tally.

    Results with an exception or a status >= 400 count as bad, status 200
    counts as good, and anything else counts as a warning.
    '''
    start_url = 'https://voussoir.net/'
    # The start URL is marked as seen up front. Otherwise the first page that
    # links back to the homepage would enqueue it again, and it would be
    # requested and counted a second time.
    seen = {start_url}
    queue = collections.deque([start_url])

    if vlogging.ARGV_LEVEL >= vlogging.INFO:
        progressbar = progressbars.Bar1(total=1)
    else:
        progressbar = progressbars.DoNothing()

    # (exception, status code, domain) -> list of results in that group.
    results = {}
    # url -> set of urls that link or redirect to it. Both spellings of the
    # /writing index share one set, because the bare spelling is rewritten to
    # the slashed one when it is taken off the queue.
    writing_linkers = set()
    linked_by = {
        'https://voussoir.net/writing': writing_linkers,
        'https://voussoir.net/writing/': writing_linkers,
    }

    goods = 0
    warnings = 0
    bads = 0

    processed_count = 0
    while queue:
        url = queue.popleft()
        if url == 'https://voussoir.net/writing':
            url = 'https://voussoir.net/writing/'
        result = dotdict.DotDict()
        result.exc = None
        result.url = url
        result.url_parts = urllib.parse.urlparse(url)
        if result.url_parts.netloc == 'voussoir.net':
            log.debug('HEAD %s', url)
            # Redirects are not followed here; the target is queued below and
            # gets a result of its own.
            result.head = session.head(url, allow_redirects=False)
            # The header may be absent, and may carry parameters such as
            # '; charset=utf-8', so only the bare media type is compared.
            content_type = result.head.headers.get('content-type', '')
            media_type = content_type.split(';')[0].strip().lower()
            if result.head.status_code in REDIRECT_CODES:
                location = result.head.headers.get('Location')
                if location:
                    # Location is allowed to be relative to the current URL.
                    link = urllib.parse.urljoin(url, location)
                    linked_by.setdefault(link, set()).add(url)
                    if link not in seen:
                        queue.append(link)
                        seen.add(link)
            elif media_type == 'text/html' and not url.endswith('.html'):
                log.debug('GET %s', url)
                response = session.get(url)
                soup = bs4.BeautifulSoup(response.text, 'html.parser')
                if result.url_parts.fragment:
                    if not soup.find(id=result.url_parts.fragment):
                        result.exc = BrokenAnchor(f'Broken anchor: #{result.url_parts.fragment}')
                links = extract_links(url, soup)
                for link in links:
                    linked_by.setdefault(link, set()).add(url)
                new_links = links.difference(seen)
                seen.update(new_links)
                queue.extend(new_links)
            key = (result.exc, result.head.status_code, result.url_parts.netloc)
            results.setdefault(key, []).append(result)
        elif do_external:
            log.debug('HEAD %s', url)
            # 999 is the placeholder status for requests that never produced
            # a response.
            status = 999
            try:
                result.head = session.head(url, timeout=10)
                status = result.head.status_code
                if status == 405:
                    # The server refuses HEAD, so try again with GET. The
                    # timeout keeps one unresponsive host from stalling the
                    # whole run.
                    result.head = session.get(url, timeout=10)
                    status = result.head.status_code
            except Exception as exc:
                # Deliberately broad: any failure to reach an external site is
                # recorded on the result and reported instead of aborting the
                # crawl.
                result.exc = exc
            key = (result.exc, status, result.url_parts.netloc)
            results.setdefault(key, []).append(result)
        else:
            continue

        # When exc is set, head may never have been assigned, so exc has to
        # be tested first.
        if result.exc or result.head.status_code >= 400:
            bads += 1
        elif result.head.status_code == 200:
            goods += 1
        else:
            warnings += 1

        progressbar.set_total(len(seen))
        processed_count += 1
        progressbar.step(processed_count)

    progressbar.done()

    # Groups without an exception come first, then ascending status code.
    results = sorted(results.items(), key=lambda pair: (pair[0][0] is not None, pair[0][1]))

    def makemessage(result):
        '''
        Build the report text for one result: a main line, followed by one
        line for each page that links to the result's URL.
        '''
        if result.exc:
            mainline = f'EXC {result.url}\n{result.exc}'
        else:
            mainline = f'{result.head.status_code} {result.url}'
            # Healthy links are listed without their linkers unless the log
            # level is DEBUG or lower.
            if result.head.status_code == 200 and vlogging.ARGV_LEVEL > vlogging.DEBUG:
                return mainline
            if result.head.status_code in REDIRECT_CODES:
                mainline += f' -> {result.head.headers.get("location")}'

        lines = [mainline]
        # The start URL has no entry if no page links back to it. Sorting
        # keeps the output stable from run to run.
        for linked in sorted(linked_by.get(result.url, ())):
            lines.append(f'    Linked by {linked}')
        return '\n'.join(lines)

    for ((_exc, status, domain), result_group) in results:
        print(niceprints.equals_header(f'{status} {domain}'))
        for result in result_group:
            message = makemessage(result)
            if result.exc or result.head.status_code >= 400:
                log.error(message)
            elif result.head.status_code == 200:
                log.info(message)
            else:
                log.warning(message)
        print()

    print(f'{goods} good, {warnings} warnings, {bads} bad.')
|
|
|
|
|
|
|
|
def linkchecker_argparse(args):
    '''
    Argparse entry point: run the link checker with the parsed `args` and
    return 0 as the exit status.
    '''
    internal_only = args.internal
    linkchecker(do_external=not internal_only)
    return 0
|
|
|
|
|
|
|
|
@vlogging.main_decorator
def main(argv):
    '''
    Parse the command line arguments in `argv`, call the function registered
    for them, and return that function's return value.
    '''
    parser = argparse.ArgumentParser(
        description='''
        ''',
    )
    parser.add_argument(
        '--internal',
        action='store_true',
        help='''
        Only check internal links.
        ''',
    )
    parser.set_defaults(func=linkchecker_argparse)

    parsed = parser.parse_args(argv)
    handler = parsed.func
    return handler(parsed)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Use main's return value as the process exit status.
    exit_status = main(sys.argv[1:])
    raise SystemExit(exit_status)
|