Preventing link rot
An initial approach to checking for link rot.
Rebooting the blog reminded me that one worry I often have is dealing with link rot.
Rough Idea #
- Fetch the content of a link.
- If the link does not come back:
  - note the kind of error (404, 4xx, 5xx)
  - move on to the next link
- If the link comes back:
  - record the success
  - if the domain of the link is different than the target domain: move on to the next link
  - look for references to other resources and add them to the queue (see the sketch below):
    - <* href="$URL">: a, area, base, link
    - <* src="$URL">: audio, embed, iframe, img, input, script, source, track, video
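For reference, that tag/attribute mapping can be written down as a small table (just a sketch; the scripts below simply select on [src],[href] without distinguishing tags):

# Which attribute carries the URL for which tags (from the list above).
URL_ATTRS = {
    "href": ["a", "area", "base", "link"],
    "src": ["audio", "embed", "iframe", "img", "input", "script", "source", "track", "video"],
}

# Equivalent CSS selector, e.g. for BeautifulSoup's .select():
SELECTOR = ", ".join(
    f"{tag}[{attr}]" for attr, tags in URL_ATTRS.items() for tag in tags
)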
Attempt 1: Naive Approach #
Let's get this working in the most straightforward way first.
# lib
from bs4 import BeautifulSoup
import requests


def main():
    BASE_URL = "http://localhost:8080"
    target = "http://localhost:8080/blog"

    todo, done = [], {}
    todo.append(target)
    while todo:
        link = todo.pop()
        if link in done:
            continue

        # link is new
        try:
            req = requests.get(link, timeout=3)
            done[link] = dict(code=req.status_code)
            print(f"{req.status_code} {link}")
            if req.status_code != 200:
                continue
        except Exception as e:
            done[link] = dict(code="ERR")
            print(f"ERR {link}", e)
            continue

        # link is good
        if not link.startswith(target):
            continue

        # link should be scraped
        soup = BeautifulSoup(req.content, features="html.parser")
        for tag in soup.select("[src],[href]"):
            next_link = ""
            if "src" in tag.attrs:
                next_link = tag["src"]
            elif "href" in tag.attrs:
                next_link = tag["href"]

            if next_link.startswith("#"):  # ignore fragments
                continue
            elif next_link.startswith("/"):
                next_link = f"{BASE_URL}{next_link}"

            if next_link:
                todo.append(next_link)


if __name__ == "__main__":
    main()
This takes about 6 minutes to run on my locally-hosted version of my blog. Kinda slow, but a fine start. I also discover that my port to eleventy missed an important variable.
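One limitation worth flagging already: only root-relative links (the ones starting with /) are rewritten against BASE_URL, so a page-relative link like ../post-2/ would be requested as-is and show up as an error. If I ever want to handle those, urllib.parse.urljoin is the natural tool (a sketch, not part of the script above):

from urllib.parse import urljoin

# Resolve a link relative to the page it was found on.
page = "http://localhost:8080/blog/post-1/"
print(urljoin(page, "/css/style.css"))  # http://localhost:8080/css/style.css
print(urljoin(page, "../post-2/"))      # http://localhost:8080/blog/post-2/
print(urljoin(page, "#section"))        # http://localhost:8080/blog/post-1/#section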
Attempt 2: Naive Parallel #
Normally, I'd reach for ezq to do this processing in parallel, except that todo grows as you scrape, so there's no obvious place to send the END_MSG. Who should notify workers that they are done?
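Here's the gist of that sentinel pattern with plain multiprocessing (a simplified sketch; ezq wraps the same idea): it only works when the full workload is known up front.

from multiprocessing import Process, Queue

END_MSG = None  # sentinel that tells a worker to stop (illustrative)


def worker(q: Queue) -> None:
    while True:
        item = q.get()
        if item is END_MSG:
            break
        print("processing", item)


def main() -> None:
    q = Queue()
    for item in ["a", "b", "c"]:  # the full workload is known up front
        q.put(item)

    workers = [Process(target=worker, args=(q,)) for _ in range(4)]
    for w in workers:
        w.start()
    for _ in workers:
        q.put(END_MSG)  # one sentinel per worker so each one stops
    for w in workers:
        w.join()


if __name__ == "__main__":
    main()

In the crawler, the workers themselves keep producing new links, so no single process knows when it's safe to send those sentinels.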
# lib
from bs4 import BeautifulSoup
from multiprocess import Manager

import ezq
import requests

BASE_URL = "http://localhost:8080"


def is_alive(link, done):
    ok, req = True, None
    try:
        req = requests.get(link, timeout=3)
        done[link] = dict(code=req.status_code)
        print(f"{req.status_code} {link}")
        if req.status_code != 200:
            ok = False
    except Exception as e:
        ok = False
        done[link] = dict(code="ERR")
        print(f"ERR {link}", e)
    return ok, req


def add_links(req, todo):
    soup = BeautifulSoup(req.content, features="html.parser")
    for tag in soup.select("[src],[href]"):
        next_link = ""
        if "src" in tag.attrs:
            next_link = tag["src"]
        elif "href" in tag.attrs:
            next_link = tag["href"]

        if next_link.startswith("#"):  # ignore fragments
            continue
        elif next_link.startswith("/"):
            next_link = f"{BASE_URL}{next_link}"

        if next_link:
            todo.append(next_link)


def worker(todo: list, done: dict):
    while todo:
        link = todo.pop()
        if link in done:
            continue

        # link is new
        ok, req = is_alive(link, done)
        if not ok:
            continue

        # link is good
        if not link.startswith(BASE_URL):
            continue

        # link should be scraped
        add_links(req, todo)


def main():
    with Manager() as manager:
        todo, done = manager.list(), manager.dict()
        _, req = is_alive(f"{BASE_URL}/blog", done)
        add_links(req, todo)

        for w in [ezq.run(worker, todo, done) for _ in range(ezq.NUM_CPUS)]:
            w.join()


if __name__ == "__main__":
    main()
This takes about 40 seconds. I do find the ezq wrappers are still nicer than creating Process objects, but now there are a few race conditions:

- checking if a link was processed may return False to two workers
- adding links to todo may take too long and workers may start shutting down prematurely
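The first race could be closed by making the check-and-claim step atomic, for example with a lock from the same Manager (a sketch of the idea; the next version settles for a best-effort placeholder in done instead):

def claim(link, done, lock) -> bool:
    """Return True only for the first worker to claim this link."""
    with lock:
        if link in done:
            return False
        done[link] = None  # placeholder until the real status is recorded
        return True

# the lock would be created alongside the shared structures:
#   with Manager() as manager:
#       todo, done, lock = manager.list(), manager.dict(), manager.Lock()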
Approach 3: Parallel with Sleeps #
So now we protect (a little bit) against .pop from an empty list and against a worker ending early by adding sleep to the top and bottom of the while loop. We also restructure the continue statements so that we don't accidentally end too early.
# native
from time import sleep

# lib
from bs4 import BeautifulSoup
from multiprocess import Manager

import ezq
import requests

BASE_URL = "http://localhost:8080"


def is_alive(link, done):
    ok, req = True, None
    try:
        req = requests.get(link, timeout=3)
        done[link] = dict(code=req.status_code)
        print(f"{req.status_code} {link}")
        if req.status_code != 200:
            ok = False
    except Exception as e:
        ok = False
        done[link] = dict(code="ERR")
        print(f"ERR {link}", e)
    return ok, req


def add_links(text):
    soup = BeautifulSoup(text, features="html.parser")
    for tag in soup.select("[src],[href]"):
        next_link = ""
        if "src" in tag.attrs:
            next_link = tag["src"]
        elif "href" in tag.attrs:
            next_link = tag["href"]

        if next_link.startswith("#"):  # ignore fragments
            continue
        elif next_link.startswith("/"):
            next_link = f"{BASE_URL}{next_link}"

        if next_link:
            yield next_link


def worker(todo: list, done: dict):
    while todo:
        if not todo:
            sleep(3)
            continue

        link = todo.pop()
        if link not in done:  # new link
            done[link] = None  # prevent other workers
            ok, req = is_alive(link, done)
            if ok and link.startswith(BASE_URL):  # scrape this link
                for item in add_links(req.content):
                    if item not in done:  # only add new links
                        todo.append(item)

        if not todo:  # maybe we're all done
            sleep(3)


def main():
    with Manager() as manager:
        todo, done = manager.list(), manager.dict()
        _, req = is_alive(f"{BASE_URL}/blog", done)
        for link in add_links(req.content):
            todo.append(link)

        for w in [ezq.run(worker, todo, done) for _ in range(ezq.NUM_CPUS)]:
            w.join()


if __name__ == "__main__":
    main()
This also takes about 40 seconds but with fewer differences in output from the initial naive version.
Approach 4: Process the content type #
Now that I've set up the skeleton, I'm noticing that there are a bunch of content errors, including trying to parse XML (the Atom feed) and getting lots of 403 responses. We can do a little better by processing the Content-Type.
# native
from time import sleep
import re

# lib
from bs4 import BeautifulSoup
from multiprocess import Manager

import ezq
import requests

BASE_URL = "http://localhost:8080"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
RE_CSS_IMPORT = re.compile(r"""@import ["']([^"']+)""", re.MULTILINE)
RE_CSS_URL = re.compile(r"url\s*\(\s*([^)]+)\s*\)", re.MULTILINE)


def is_alive(link, done):
    ok, req = True, None
    try:
        req = requests.get(link, timeout=3, headers=HEADERS)
        done[link] = dict(code=req.status_code)
        print(f"{req.status_code} {link}")
        if req.status_code != 200:
            ok = False
    except Exception as e:
        ok = False
        done[link] = dict(code="ERR")
        print(f"ERR {link}", e)
    return ok, req


def get_css_links(req):
    text = req.text
    for pattern in [RE_CSS_IMPORT, RE_CSS_URL]:
        for link in pattern.findall(text):
            yield link


def get_xhtml_links(soup):
    for tag in soup.select("[src],[href]"):
        if "src" in tag.attrs:
            yield tag["src"]
        elif "href" in tag.attrs:
            yield tag["href"]


def add_links(req):
    ctype = req.headers["Content-Type"]
    if ctype.startswith("text/css") or req.url.endswith(".css"):
        links = get_css_links(req)
    elif ctype.startswith("text/xml") or req.url.endswith(".xml"):
        links = get_xhtml_links(BeautifulSoup(req.content, features="xml"))
    else:
        links = get_xhtml_links(BeautifulSoup(req.content, features="html.parser"))

    for link in links:
        if (
            link.startswith("#")  # fragment
            or link.startswith("mailto:")  # email
            or link.startswith("data:")  # base64-encoded data
        ):
            continue

        if link.startswith("/"):
            link = f"{BASE_URL}{link}"
        # TODO: add relative links

        if link:
            yield link


def worker(todo: list, done: dict):
    while todo:
        if not todo:
            sleep(3)
            continue

        link = todo.pop()
        if link not in done:  # new link
            done[link] = None  # prevent other workers
            ok, req = is_alive(link, done)
            if ok and link.startswith(BASE_URL):  # scrape this link
                for item in add_links(req):
                    if item not in done:  # only add new links
                        todo.append(item)

        if not todo:  # maybe we're all done
            sleep(3)


def main():
    with Manager() as manager:
        todo, done = manager.list(), manager.dict()
        _, req = is_alive(f"{BASE_URL}/blog", done)
        for link in add_links(req):
            todo.append(link)

        for w in [ezq.run(worker, todo, done) for _ in range(ezq.NUM_CPUS)]:
            w.join()


if __name__ == "__main__":
    main()
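As a quick sanity check, here's roughly what those two CSS patterns match (the stylesheet snippet is made up):

import re

RE_CSS_IMPORT = re.compile(r"""@import ["']([^"']+)""", re.MULTILINE)
RE_CSS_URL = re.compile(r"url\s*\(\s*([^)]+)\s*\)", re.MULTILINE)

css = '@import "base.css";\nbody { background: url(/img/bg.png); }'
print(RE_CSS_IMPORT.findall(css))  # ['base.css']
print(RE_CSS_URL.findall(css))     # ['/img/bg.png']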
Open Tasks #
Updates #
2023-11-28 #
- Updated broken links.
2023-11-28 #
- freshlinks 1.0.0 is now published.