Merge pull request #261 from datakind/develop

vishpillai123 · web-flow · commit cfdaef0eb54f · 2025-07-15T17:15:31.000-04:00
Release 0.3.9
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # CHANGELOG
 
+## 0.3.9 (2025-07)
+- Add pre-cohort courses to config (PR 258)
+
 ## 0.3.8 (2025-07)
 - Patching up gold volume path due to Azure/GCP naming differences, keeping GCP version (PR 249)
 
diff --git a/pipelines/pdp/institution_id/00-data-assessment-TEMPLATE.py b/pipelines/pdp/institution_id/00-data-assessment-TEMPLATE.py
@@ -25,7 +25,7 @@
 
 # install dependencies, of which most/all should come through our 1st-party SST package
 
-# %pip install "student-success-tool == 0.3.8"
+# %pip install "student-success-tool == 0.3.9"
 
 # COMMAND ----------
 
@@ -291,6 +291,58 @@
 
 # COMMAND ----------
 
+# MAGIC %md
+# MAGIC ### Checking for inconsistencies in raw files
+# MAGIC We want to check for inconsistencies before any converter functions get applied, so that we can raise them with NSC if need be
+
+# COMMAND ----------
+
+df_raw = (
+    pd.merge(
+        df_cohort,
+        df_course,
+        on=cfg.student_id_col,
+        how="outer",
+        suffixes=("_cohort", "_course"),
+        indicator=True,
+    )
+    # HACK: columns overlap on more than just student_guid
+    # let's rename/drop a relevant few for convenience
+    .rename(
+        columns={
+            "cohort_cohort": "cohort",
+            "cohort_term_cohort": "cohort_term",
+            "student_age_cohort": "student_age",
+            "race_cohort": "race",
+            "ethnicity_cohort": "ethnicity",
+            "gender_cohort": "gender",
+            "institution_id_cohort": "institution_id",
+        }
+    )
+    .drop(
+        columns=[
+            "cohort_course",
+            "cohort_term_course",
+            "student_age_course",
+            "race_course",
+            "ethnicity_course",
+            "gender_course",
+            "institution_id_course",
+        ]
+    )
+)
+df_raw["_merge"].value_counts()
+
+# COMMAND ----------
+
+# any patterns in mis-joined records?
+df_raw.loc[df_raw["_merge"] != "both", :]
+
+# which students don't appear in both datasets?
+df_raw.loc[df_raw["_merge"] != "both", cfg.student_id_col].unique().tolist()
+
+# COMMAND ----------
+
 # MAGIC %md
 # MAGIC ### null values
 
diff --git a/pipelines/pdp/institution_id/01-preprocess-data-TEMPLATE.py b/pipelines/pdp/institution_id/01-preprocess-data-TEMPLATE.py
@@ -24,7 +24,7 @@
 
 # install dependencies, of which most/all should come through our 1st-party SST package
 
-# %pip install "student-success-tool == 0.3.8"
+# %pip install "student-success-tool == 0.3.9"
 
 # COMMAND ----------
 
@@ -132,6 +132,24 @@
 
 # COMMAND ----------
 
+# MAGIC %md
+# MAGIC ### Handling Pre-Cohort Courses
+# MAGIC
+# MAGIC Please remember to check with your schools during the data assessment call how they would like pre-cohort course records to be handled.
+# MAGIC
+
+# COMMAND ----------
+
+# We usually drop pre-cohort course records. If the school requests otherwise, set include_pre_cohort_courses = true in your config, re-load the config, and THEN run this cell!
+
+if not cfg.preprocessing.include_pre_cohort_courses:
+    df_student_terms = df_student_terms[df_student_terms["term_is_pre_cohort"] == False]
+
+# sanity check; should be 0 True unless include_pre_cohort_courses is enabled in the config
+df_student_terms["term_is_pre_cohort"].value_counts(dropna=False)
+
+# COMMAND ----------
+
 # MAGIC %md
 # MAGIC # select students and compute targets
 
diff --git a/pipelines/pdp/institution_id/02-train-model-TEMPLATE.py b/pipelines/pdp/institution_id/02-train-model-TEMPLATE.py
@@ -28,7 +28,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.9"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 
diff --git a/pipelines/pdp/institution_id/03-make-predictions-TEMPLATE.py b/pipelines/pdp/institution_id/03-make-predictions-TEMPLATE.py
@@ -26,7 +26,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.9"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 
diff --git a/pipelines/pdp/institution_id/04-register-model-create-card-TEMPLATE.py b/pipelines/pdp/institution_id/04-register-model-create-card-TEMPLATE.py
@@ -21,7 +21,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.9"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 
diff --git a/pipelines/pdp/institution_id/config-TEMPLATE.toml b/pipelines/pdp/institution_id/config-TEMPLATE.toml
@@ -30,6 +30,7 @@ framework = "sklearn"
 [preprocessing]
 splits = { train = 0.6, test = 0.2, validate = 0.2 }
 sample_class_weight = "balanced"
+include_pre_cohort_courses = false
 
 [preprocessing.features]
 min_passing_grade = 1.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "student-success-tool"
-version = "0.3.8"
+version = "0.3.9"
 description = "School-agnostic lib for implementing Student Success Tool workflows."
 readme = "README.md"
 requires-python = ">=3.10,<3.13"
diff --git a/src/student_success_tool/configs/pdp.py b/src/student_success_tool/configs/pdp.py
@@ -159,6 +159,12 @@ class PreprocessingConfig(pyd.BaseModel):
             ),
         )
     )
+    include_pre_cohort_courses: bool = pyd.Field(
+        default=False,
+        description=(
+            "Whether to include course records that occurred before the student's cohort term. Usually, we do end up excluding these so the default will always be False unless set otherwise."
+        ),
+    )
 
     @pyd.field_validator("splits", mode="after")
     @classmethod

Original file line number	Diff line number	Diff line change
`@@ -159,6 +159,12 @@ class PreprocessingConfig(pyd.BaseModel):`
`159`	`159`	`),`
`160`	`160`	`)`
`161`	`161`	`)`
	`162`	`+ include_pre_cohort_courses: bool = pyd.Field(`
	`163`	`+ default=False,`
	`164`	`+ description=(`
	`165`	`+ "Whether to include course records that occurred before the student's cohort term. Usually, we do end up excluding these so the default will always be False unless set otherwise."`
	`166`	`+ ),`
	`167`	`+ )`
`162`	`168`
`163`	`169`	`@pyd.field_validator("splits", mode="after")`
`164`	`170`	`@classmethod`