From 73b621d3adf4c2f55f0150b84b555ce5a14b252b Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Mon, 13 Oct 2025 19:17:53 -0700 Subject: [PATCH] Mark bad `climate.nasa.gov` redirects as 404 NASA went through a year-and-a-half-long transition of pages from `climate.nasa.gov` to `science.nasa.gov/climate-change` that concluded a couple of weeks ago. Unfortunately, when they finished, they started redirecting `climate.nasa.gov/*` to the new climate change home page instead of to the matching page on the new site, effectively turning a bunch of URLs into 404s. See also: https://github.com/edgi-govdata-archiving/web-monitoring-db/pull/1306 --- analyst_sheets/analyze.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/analyst_sheets/analyze.py b/analyst_sheets/analyze.py index ae195a6..6b72e23 100644 --- a/analyst_sheets/analyze.py +++ b/analyst_sheets/analyze.py @@ -351,6 +351,15 @@ def get_version_status(version: dict) -> int: if redirects and redirects[-1].endswith('epa.gov/sites/production/files/signpost/cc.html'): return 404 + # Special case for climate.nasa.gov getting moved with bad redirects for + # all the sub-pages (they all redirected to the new home page). + if ( + redirects + and re.match(r'^https?://climate.nasa.gov/.+$', url, re.IGNORECASE) + and redirects[-1].endswith('://science.nasa.gov/climate-change/') + ): + return 404 + if version['title']: # Page titles are frequently formulated like " | <site name>" or # "<title> | <site section> | <site name>" (order may also be reversed).