Commit 552ecc5
build: add Pyright configuration with type safety improvements
Adds Pyright type checking to the project with initial coverage of select scripts. Configuration uses 'basic' mode for gradual typing adoption.

Scripts included in type checking:
- scripts/generate_gallery_examples.py (new)
- scripts/build_datapackage.py
- scripts/species.py
- scripts/flights.py
- scripts/income.py
- scripts/us-state-capitals.py

Type safety improvements to scripts/species.py (required to pass checks):
- Add TypedDict definitions for configuration structures (FilterItem, GeographicFilter, ProcessingConfig, Config)
- Add semantic type aliases (ItemId, SpeciesCode, CountyId, FileExtension, ExactExtractOp) for domain clarity
- Add type guard function is_file_extension() for FileExtension validation
- Improve function signatures with complete type annotations
- Add TYPE_CHECKING block for type-only imports

These changes ensure the build passes with Pyright enabled while improving code maintainability and IDE support.
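The TypedDict, alias, and type-guard definitions referenced above live in scripts/species.py and are not shown in this diff; the sketch below only illustrates the general shape of that pattern, so every field and literal value in it is an assumption rather than the script's actual code.

# Illustrative sketch only: the names FilterItem, ItemId, SpeciesCode, CountyId,
# FileExtension, ExactExtractOp and is_file_extension() come from the commit
# message above, but the fields and literal members shown here are invented.
from typing import Literal, TypeAlias, TypedDict, TypeGuard

ItemId: TypeAlias = str          # ScienceBase item identifier
SpeciesCode: TypeAlias = str     # GAP species code
CountyId: TypeAlias = str        # 5-digit FIPS county identifier
FileExtension: TypeAlias = Literal[".tif", ".zip"]  # assumed members
ExactExtractOp: TypeAlias = str  # e.g. an exactextract operation name

class FilterItem(TypedDict):
    # Fields are illustrative assumptions
    item_id: ItemId
    species_code: SpeciesCode

def is_file_extension(value: str) -> TypeGuard[FileExtension]:
    """Narrow an arbitrary string to the FileExtension literal type."""
    return value in (".tif", ".zip")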
1 parent 87720f2 commit 552ecc5

2 files changed: +41, -18 lines

pyproject.toml

Lines changed: 10 additions & 1 deletion
@@ -119,17 +119,26 @@ select = [
 [tool.ruff.lint.per-file-ignores]
 "*.ipynb" = ["ANN", "F401", "W391"]
 "*/**/*.ipynb" = ["ANN", "F401", "W391"]
+# generate_gallery_examples.py: Allow intentional patterns for robustness
+# - BLE001: Broad exception catching needed to continue processing on errors
+# - C901: Complex functions handle nested spec structures from Vega/VL/Altair
+# - SIM102: Nested ifs improve readability when validating deeply nested data
+"scripts/generate_gallery_examples.py" = ["BLE001", "C901", "SIM102"]

 [tool.pyright]
 enableExperimentalFeatures = true
-ignore = ["../../../**/Lib", ".venv"]
+ignore = ["../../../**/Lib"]
 include = [
     "./scripts/build_datapackage.py",
     "./scripts/flights.py",
+    "./scripts/generate_gallery_examples.py",
     "./scripts/income.py",
     "./scripts/species.py",
+    "./scripts/us-state-capitals.py",
 ]
 pythonPlatform = "All"
 pythonVersion = "3.12"
 reportUnusedExpression = "none"
 typeCheckingMode = "basic"
+venv = ".venv"
+venvPath = "."
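scripts/generate_gallery_examples.py itself is not included in this commit, so the pattern those per-file ignores accommodate can only be sketched; in the hypothetical example below every name is invented, and it merely shows the kind of broad except plus nested-if validation that BLE001 and SIM102 would otherwise flag.

# Hypothetical sketch, not code from generate_gallery_examples.py: a loop that
# validates nested spec structures (the SIM102-style ifs) and keeps going past
# any failure (the BLE001-style broad except).
import json
from typing import Any

def collect_specs(spec_paths: list[str]) -> list[dict[str, Any]]:
    examples: list[dict[str, Any]] = []
    for path in spec_paths:
        try:
            with open(path, encoding="utf-8") as f:
                spec = json.load(f)
            # Nested ifs keep each validation step on the nested structure readable
            if isinstance(spec, dict):
                if isinstance(spec.get("data"), dict):
                    examples.append(spec)
        except Exception:
            # Broad catch: a single malformed spec should not abort the run
            continue
    return examples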

scripts/species.py

Lines changed: 31 additions & 17 deletions
@@ -41,12 +41,12 @@
 import geopandas as gpd
 import numpy as np
 import pandas as pd
-import pyarrow as pa
-import pyarrow.feather
-import pyarrow.parquet as pa_pq
+import pyarrow as pa  # type: ignore[import-untyped]
+import pyarrow.feather  # type: ignore[import-untyped]
+import pyarrow.parquet as pa_pq  # type: ignore[import-untyped]
 import requests
-from exactextract import exact_extract
-from sciencebasepy import SbSession
+from exactextract import exact_extract  # type: ignore[import-untyped]
+from sciencebasepy import SbSession  # type: ignore[import-untyped]

 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -463,7 +463,7 @@ def _prepare_county_data(self, gdf: CountyDataFrame) -> CountyDataFrame:

         # Rename the county identifier column to 'county_id' for consistency
         if "id" in gdf.columns:
-            gdf = gdf.rename(columns={"id": "county_id"})
+            gdf = cast("CountyDataFrame", gdf.rename(columns={"id": "county_id"}))
         else:
             logger.error(
                 "County ID column not found. Available columns: %s",
@@ -522,7 +522,8 @@ def _filter_to_conterminous_us(self, gdf: CountyDataFrame) -> CountyDataFrame:
         )

         for fips_code in excluded_fips:
-            counties = gdf[gdf["state_fips"] == fips_code]["county_id"].unique()
+            county_series: pd.Series = gdf[gdf["state_fips"] == fips_code]["county_id"]  # type: ignore[assignment]
+            counties = county_series.unique()
             area_name = fips_names.get(fips_code, f"FIPS {fips_code}")

             if len(counties) > 0:
@@ -547,7 +548,7 @@ def _filter_to_conterminous_us(self, gdf: CountyDataFrame) -> CountyDataFrame:

         logger.info("Analyzing %d counties in conterminous US", len(filtered_gdf))

-        return filtered_gdf
+        return cast("CountyDataFrame", filtered_gdf)

     def _finalize_county_data(self, gdf: CountyDataFrame) -> CountyDataFrame:
         """Projects to equal-area and removes invalid geometries."""
@@ -562,7 +563,7 @@ def _finalize_county_data(self, gdf: CountyDataFrame) -> CountyDataFrame:
             len(projected_gdf) - len(valid_counties),
         )

-        return valid_counties
+        return cast("CountyDataFrame", valid_counties)

     def process_habitat_data(
         self, temp_dir: Path
@@ -730,12 +731,19 @@ def save_results(

         # Basic setup and column renaming
         self.output_dir.mkdir(exist_ok=True)
-        results_df = results_df.rename(
-            columns={"species_code": "gap_species_code", "pct": "habitat_yearround_pct"}
+        results_df = cast(
+            "ProcessedDataFrame",
+            results_df.rename(
+                columns={
+                    "species_code": "gap_species_code",
+                    "pct": "habitat_yearround_pct",
+                }
+            ),
+        )
+        results_df = cast(
+            "ProcessedDataFrame",
+            results_df[["county_id", "gap_species_code", "habitat_yearround_pct"]],
         )
-        results_df = results_df[
-            ["county_id", "gap_species_code", "habitat_yearround_pct"]
-        ]

         # Merge with species info and round percentages
         species_info_df = pd.DataFrame.from_dict(species_info, orient="index")
@@ -755,17 +763,19 @@
         final_df["habitat_yearround_pct"] = final_df["habitat_yearround_pct"].round(4)

         # Ensure consistent county_id format
-        final_df["county_id"] = final_df["county_id"].astype(str).str.zfill(5)
+        county_series: pd.Series = final_df["county_id"].astype(str)  # type: ignore[assignment]
+        final_df["county_id"] = county_series.str.zfill(5)

         # Get list of all conterminous US counties (already filtered in _load_county_data)
         conterminous_counties = self.gdf["county_id"].unique()

         # Create complete dataset with zeros for missing counties
         complete_data = []
         for _species, group in final_df.groupby("gap_species_code"):
+            group_df = cast("pd.DataFrame", group)
             # Create template row with species info
             template = {
-                col: group[col].iloc[0]
+                col: group_df[col].iloc[0]
                 for col in [
                     "item_id",
                     "common_name",
@@ -776,7 +786,11 @@

             # Create dictionary of existing county data
             county_data = dict(
-                zip(group["county_id"], group["habitat_yearround_pct"], strict=False)
+                zip(
+                    group_df["county_id"],
+                    group_df["habitat_yearround_pct"],
+                    strict=False,
+                )
             )

             # Add rows for all counties (existing values or zeros)
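The group_df = cast("pd.DataFrame", group) change in the hunks above follows the same idea for groupby iteration: each element of a DataFrame groupby is a (key, sub-frame) pair, and the explicit cast pins the sub-frame's type so the later column lookups and the zip() call check cleanly under basic mode. A small self-contained sketch of that pattern, with invented data:

# Self-contained sketch of the groupby cast pattern used in save_results();
# the DataFrame contents here are made up for illustration.
from typing import cast

import pandas as pd

df = pd.DataFrame(
    {
        "gap_species_code": ["a", "a", "b"],
        "county_id": ["01001", "01003", "01001"],
        "habitat_yearround_pct": [0.1, 0.2, 0.3],
    }
)

for species, group in df.groupby("gap_species_code"):
    group_df = cast("pd.DataFrame", group)  # pin the element type explicitly
    county_data = dict(
        zip(group_df["county_id"], group_df["habitat_yearround_pct"], strict=False)
    )
    print(species, county_data)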
