Add linkchecker.md.

master
voussoir 2022-04-12 14:44:36 -07:00
parent c98dc65b73
commit af06e873e9
2 changed files with 237 additions and 0 deletions

linkchecker.py (new file, 203 lines)

@@ -0,0 +1,203 @@
import argparse
import bs4
import collections
import requests
import sys
import urllib.parse

from voussoirkit import dotdict
from voussoirkit import niceprints
from voussoirkit import progressbars
from voussoirkit import vlogging

log = vlogging.get_logger(__name__, 'linkchecker')
vlogging.get_logger('urllib3').setLevel(vlogging.SILENT)

REDIRECT_CODES = {301, 302, 303, 307}

session = requests.Session()
session.headers['User-Agent'] = 'voussoir.net link health checker'

class BrokenAnchor(Exception):
    pass

def extract_links(url, soup):
    links = set()
    for a in soup.select('a[href]'):
        if not a['href'].startswith('#'):
            links.add(a['href'])
    for img in soup.select('img[src]'):
        links.add(img['src'])
    for audio in soup.select('audio[src]'):
        links.add(audio['src'])
    for video in soup.select('video[src]'):
        links.add(video['src'])
    links = {urllib.parse.urljoin(url, link) for link in links}
    links = {link for link in links if not ignore_link(link)}
    links = {normalize_link(link) for link in links}
    return links

def ignore_link(url):
    return (
        url.startswith('https://voussoir.net/writing/tags') or
        url.startswith('https://github.com/voussoir/voussoir.net/commit/')
    )

def normalize_link(url):
    parts = urllib.parse.urlparse(url)
    # Youtube returns HTTP 200 even for invalid video ids at /watch URLs, but
    # the thumbnail images return 404.
    if parts.netloc.endswith('youtube.com'):
        query = urllib.parse.parse_qs(parts.query, keep_blank_values=False)
        video_id = query.get('v', [''])[0]
        if video_id:
            return f'https://i3.ytimg.com/vi/{video_id}/default.jpg'
    elif parts.netloc.endswith('youtu.be'):
        video_id = parts.path.strip('/')
        if video_id:
            return f'https://i3.ytimg.com/vi/{video_id}/default.jpg'
    return url
def linkchecker(do_external=True):
    # Breadth-first crawl starting from the homepage. Internal HTML pages are
    # fetched and parsed for more links; everything else just gets a status check.
    seen = set()
    queue = collections.deque()
    queue.append('https://voussoir.net/')

    if vlogging.ARGV_LEVEL >= vlogging.INFO:
        progressbar = progressbars.Bar1(total=1)
    else:
        progressbar = progressbars.DoNothing()

    results = {}
    linked_by = {
        'https://voussoir.net/writing': set()
    }
    linked_by['https://voussoir.net/writing/'] = linked_by['https://voussoir.net/writing']

    goods = 0
    warnings = 0
    bads = 0
    processed_count = 0

    while len(queue) > 0:
        url = queue.popleft()
        if url == 'https://voussoir.net/writing':
            url = 'https://voussoir.net/writing/'

        result = dotdict.DotDict()
        result.exc = None
        result.url = url
        result.url_parts = urllib.parse.urlparse(url)

        if result.url_parts.netloc == 'voussoir.net':
            log.debug('HEAD %s', url)
            result.head = session.head(url, allow_redirects=False)
            if result.head.status_code in REDIRECT_CODES:
                link = result.head.headers['Location']
                linked_by.setdefault(link, set()).add(url)
                if link not in seen:
                    queue.append(link)
                    seen.add(link)
            elif result.head.headers['content-type'] == 'text/html' and not url.endswith('.html'):
                log.debug('GET %s', url)
                response = session.get(url)
                soup = bs4.BeautifulSoup(response.text, 'html.parser')
                if result.url_parts.fragment:
                    if not soup.select(f'#{result.url_parts.fragment}'):
                        result.exc = BrokenAnchor(f'Broken anchor: #{result.url_parts.fragment}')
                links = extract_links(url, soup)
                for link in links:
                    linked_by.setdefault(link, set()).add(url)
                new_links = links.difference(seen)
                seen.update(new_links)
                queue.extend(new_links)
            key = (result.exc, result.head.status_code, result.url_parts.netloc)
            results.setdefault(key, []).append(result)
        elif do_external:
            log.debug('HEAD %s', url)
            # 999 stands in for "no response at all" when the request raises.
            status = 999
            try:
                result.head = session.head(url, timeout=10)
                status = result.head.status_code
                # Some servers reject HEAD with 405, so retry with GET.
                if result.head.status_code == 405:
                    try:
                        result.head = session.get(url)
                        status = result.head.status_code
                    except Exception as exc:
                        result.exc = exc
            except Exception as exc:
                result.exc = exc
            key = (result.exc, status, result.url_parts.netloc)
            results.setdefault(key, []).append(result)
        else:
            continue

        if result.exc or result.head.status_code >= 400:
            bads += 1
        elif result.head.status_code == 200:
            goods += 1
        else:
            warnings += 1

        progressbar.set_total(len(seen))
        processed_count += 1
        progressbar.step(processed_count)

    progressbar.done()
    results = sorted(results.items(), key=lambda pair: (pair[0][0] is not None, pair[0][1]))

    def makemessage(result):
        if result.exc:
            mainline = f'EXC {result.url}\n{result.exc}'
        else:
            mainline = f'{result.head.status_code} {result.url}'
            if result.head.status_code == 200 and vlogging.ARGV_LEVEL > vlogging.DEBUG:
                return mainline
            if result.head.status_code in REDIRECT_CODES:
                mainline += f' -> {result.head.headers.get("location")}'
        lines = [mainline]
        for linked in linked_by[result.url]:
            lines.append(f' Linked by {linked}')
        return '\n'.join(lines)
    for ((exc, status, domain), result_group) in results:
        print(niceprints.equals_header(f'{status} {domain}'))
        for result in result_group:
            message = makemessage(result)
            if result.exc or result.head.status_code >= 400:
                log.error(message)
            elif result.head.status_code == 200:
                log.info(message)
            else:
                log.warning(message)
        print()
    print(f'{goods} good, {warnings} warnings, {bads} bad.')

def linkchecker_argparse(args):
    linkchecker(do_external=not args.internal)
    return 0

@vlogging.main_decorator
def main(argv):
    parser = argparse.ArgumentParser(
        description='''
        ''',
    )
    parser.add_argument(
        '--internal',
        action='store_true',
        help='''
        Only check internal links.
        ''',
    )
    parser.set_defaults(func=linkchecker_argparse)
    args = parser.parse_args(argv)
    return args.func(args)

if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))

linkchecker.md (new file, 34 lines)

@@ -0,0 +1,34 @@
Finding and fixing dead links on this blog
==========================================
I said from the [beginning](/writing/motivations_for_writing) that gwern.net was a source of motivation for me to start writing, and I specifically linked to his article about [archiving the URLs to which his blog links](https://www.gwern.net/Archiving-URLs). It's time for me to start doing the same! I don't have quite as much to say but I thought it'd be worth documenting. Gwern just recently said he's working on a [new system](https://news.ycombinator.com/item?id=30940141).
## Archiving linked pages
To preserve third-party pages, I use a browser extension called [SingleFile](https://github.com/gildas-lormeau/SingleFile). It saves the entire page into a single .html file by using `data:` URIs to embed images and other resources. Then I just upload that file along with my own article and link to it. At the moment I am not very concerned about cloning entire websites -- I just need one page at a time.
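To make that concrete, here's a minimal sketch of what the embedding amounts to. This is not SingleFile's actual code, and the `embed_image` helper and its arguments are hypothetical -- the point is just that the image bytes get base64-encoded into a `data:` URI so the page no longer depends on the external file:

```python
import base64
import mimetypes

def embed_image(html, image_path):
    # Guess the MIME type so the data: URI declares what kind of file it carries.
    mime = mimetypes.guess_type(image_path)[0] or 'application/octet-stream'
    with open(image_path, 'rb') as handle:
        payload = base64.b64encode(handle.read()).decode('ascii')
    # Swap the reference to the external file for the inlined copy.
    return html.replace(f'src="{image_path}"', f'src="data:{mime};base64,{payload}"')
```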
At first I was using [web.archive.org](https://web.archive.org/) to provide archive links, but now I prefer self-hosting SingleFile pages for a few reasons:
- While I expect IA to be around for a while, my goal is to reduce dependency on third parties. In some ways, depending on IA as a single third party instead of numerous third parties is better; in some ways it's worse. A single point of trust is also a single point of failure.
- IA does not give the reader an easy way to download the archived page for themselves. CTRL+S does not produce a good replica in most cases. With my SingleFile pages, you can just CTRL+S and get the exact same file that I have. My goal is to share information with you, so that's kind of important.
- IA might receive a DMCA takedown request and lose the content. If I want to preserve the linked page I'll need to make my own copy of it anyway, so why do both IA plus SingleFile when I can just do the one?
- IA is fairly slow to load. I mean no offense to them as they are a fantastic resource and I'm okay with them prioritizing capacity over speed, but it is true.
Sometimes I need to link to a newspaper or other blogger. These articles are often [professionally written](/writing/professionally_written_article) and need to be pasteurized for consumption by a sane audience. Hosting the HTML myself allows me to do this. Here's an article [before](contently_before.html) (4.11 MB) and [after](contently_after.html) (0.14 MB) I cleaned it up [footnote_link]. Oh, and that's with SingleFile's "remove scripts" option enabled. It was 9.75 MB with scripts and I don't want to waste either of our bandwidths by including that here. I shouldn't be too harsh on them, putting two dozen paragraphs of text into a document is really hard and doing it efficiently requires a great engineering team like mine.
For what it's worth, [others](https://news.ycombinator.com/item?id=30777702) and I have noticed that a lot of news sites today are deathly afraid of including external links in their text -- they'd rather provide a useless link to themselves than a useful link to a third party. I don't want to be like that, so I'll either include the original link alongside the archived one, or I'll edit the archived page to make its above-the-fold title a link back to the original URL.
[footnote_text] Base64 encoding of embedded resources makes them take up about a third more space, a tradeoff for the convenience of having it packed in a single html file. The original source for that article costs me about 3.7 MB over the wire.
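That one-third figure falls straight out of how base64 works: every 3 input bytes become 4 output characters, so the encoded copy is about 4/3 the size of the original. A quick way to check it yourself:

```python
import base64
import os

raw = os.urandom(300_000)        # stand-in for an embedded image
encoded = base64.b64encode(raw)
print(len(encoded) / len(raw))   # 1.333..., i.e. about a third larger
```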
## Fixing dead links
I wrote my own [linkchecker.py](https://github.com/voussoir/voussoir.net/blob/master/linkchecker.py) because for some reason I like writing my own solutions instead of using other people's. It gives me a report organized by HTTP status and domain. If the link has a problem, it tells me what article it's on.
It immediately found multiple problematic links, [all](https://github.com/voussoir/voussoir.net/commit/2b966be6357c073043dc2b5b64edc448a559a43b) of [which](https://github.com/voussoir/voussoir.net/commit/9c287c719fdf9c07c098a6430fe0fb5dfcbdbadf) were [my fault](https://github.com/voussoir/voussoir.net/commit/8224052c8cc9d79a3acf832f69fcd46a4d045552). My engineering team dropped the ball on this because they were too focused on innovating new ways to put paragraphs into documents, but the linkchecker should reduce these incidents in the future.
Actually, it's a good thing if all the dead links are my fault, because I can easily fix them. As long as I continue making SingleFile archives, the majority of dead links I encounter should be resources that I renamed or images that I forgot to upload. I should be able to run it on a cronjob and use [operatornotify](/writing/emailing_myself) to get emails about it, after I fine-tune the error / warning levels. I tend to link to a lot of youtube videos which, of course, I won't be rehosting here due to their large file size, but if it's a video I really care about, I'll have downloaded a copy to my personal computer and can find some way of getting it to you.
So, that's where we're at for now. If you see any other problems that I missed, you can send me an email.