Skip to content

Commit cfdaef0

Browse files
Merge pull request #261 from datakind/develop
Release 0.3.9
2 parents 7369363 + dcc3549 commit cfdaef0

File tree

9 files changed

+86
-6
lines changed

9 files changed

+86
-6
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# CHANGELOG
22

3+
## 0.3.9 (2025-07)
4+
- Add pre-cohort courses to config (PR 258)
5+
36
## 0.3.8 (2025-07)
47
- Patching up gold volume path due to Azure/GCP naming differences, keeping GCP version (PR 249)
58

pipelines/pdp/institution_id/00-data-assessment-TEMPLATE.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
# install dependencies, of which most/all should come through our 1st-party SST package
2727

28-
# %pip install "student-success-tool == 0.3.8"
28+
# %pip install "student-success-tool == 0.3.9"
2929

3030
# COMMAND ----------
3131

@@ -291,6 +291,58 @@
291291

292292
# COMMAND ----------
293293

294+
# MAGIC %md
295+
# MAGIC ### Checking for inconsistencies in raw files
296+
# MAGIC We want to check for inconsistencies before any converter functions get applied, so that we can raise them with NSC if need be
297+
298+
# COMMAND ----------
299+
300+
df_raw = (
301+
pd.merge(
302+
df_cohort,
303+
df_course,
304+
on=cfg.student_id_col,
305+
how="outer",
306+
suffixes=("_cohort", "_course"),
307+
indicator=True,
308+
)
309+
# HACK: columns overlap on more than just student_guid
310+
# let's rename/drop a relevant few for convenience
311+
.rename(
312+
columns={
313+
"cohort_cohort": "cohort",
314+
"cohort_term_cohort": "cohort_term",
315+
"student_age_cohort": "student_age",
316+
"race_cohort": "race",
317+
"ethnicity_cohort": "ethnicity",
318+
"gender_cohort": "gender",
319+
"institution_id_cohort": "institution_id",
320+
}
321+
)
322+
.drop(
323+
columns=[
324+
"cohort_course",
325+
"cohort_term_course",
326+
"student_age_course",
327+
"race_course",
328+
"ethnicity_course",
329+
"gender_course",
330+
"institution_id_course",
331+
]
332+
)
333+
)
334+
df_raw["_merge"].value_counts()
335+
336+
# COMMAND ----------
337+
338+
# any patterns in mis-joined records?
339+
df_raw.loc[df_raw["_merge"] != "both", :]
340+
341+
# which students don't appear in both datasets?
342+
df_raw.loc[df_raw["_merge"] != "both", cfg.student_id_col].unique().tolist()
343+
344+
# COMMAND ----------
345+
294346
# MAGIC %md
295347
# MAGIC ### null values
296348

pipelines/pdp/institution_id/01-preprocess-data-TEMPLATE.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
# install dependencies, of which most/all should come through our 1st-party SST package
2626

27-
# %pip install "student-success-tool == 0.3.8"
27+
# %pip install "student-success-tool == 0.3.9"
2828

2929
# COMMAND ----------
3030

@@ -132,6 +132,24 @@
132132

133133
# COMMAND ----------
134134

135+
# MAGIC %md
136+
# MAGIC ### Handling Pre-Cohort Courses
137+
# MAGIC
138+
# MAGIC Please remember to check with your schools during the data assessment call about how they would like pre-cohort course records to be handled.
139+
# MAGIC
140+
141+
# COMMAND ----------
142+
143+
# We usually drop pre-cohort course records; if the school requests otherwise, please set include_pre_cohort_courses = true in your config, re-load the config, THEN run this cell!
144+
145+
if not cfg.preprocessing.include_pre_cohort_courses:
146+
df_student_terms = df_student_terms[df_student_terms["term_is_pre_cohort"] == False]
147+
148+
# sanity check; should be 0 True (unless include_pre_cohort_courses is enabled, in which case nothing was dropped)
149+
df_student_terms["term_is_pre_cohort"].value_counts(dropna=False)
150+
151+
# COMMAND ----------
152+
135153
# MAGIC %md
136154
# MAGIC # select students and compute targets
137155

pipelines/pdp/institution_id/02-train-model-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# we need to manually install a certain version of pandas and scikit-learn in order
2929
# for our models to load and run properly.
3030

31-
# %pip install "student-success-tool==0.3.8"
31+
# %pip install "student-success-tool==0.3.9"
3232
# %pip install "pandas==1.5.3"
3333
# %pip install "scikit-learn==1.3.0"
3434

pipelines/pdp/institution_id/03-make-predictions-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
# we need to manually install a certain version of pandas and scikit-learn in order
2727
# for our models to load and run properly.
2828

29-
# %pip install "student-success-tool==0.3.8"
29+
# %pip install "student-success-tool==0.3.9"
3030
# %pip install "pandas==1.5.3"
3131
# %pip install "scikit-learn==1.3.0"
3232

pipelines/pdp/institution_id/04-register-model-create-card-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
# we need to manually install a certain version of pandas and scikit-learn in order
2222
# for our models to load and run properly.
2323

24-
# %pip install "student-success-tool==0.3.8"
24+
# %pip install "student-success-tool==0.3.9"
2525
# %pip install "pandas==1.5.3"
2626
# %pip install "scikit-learn==1.3.0"
2727

pipelines/pdp/institution_id/config-TEMPLATE.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ framework = "sklearn"
3030
[preprocessing]
3131
splits = { train = 0.6, test = 0.2, validate = 0.2 }
3232
sample_class_weight = "balanced"
33+
include_pre_cohort_courses = false
3334

3435
[preprocessing.features]
3536
min_passing_grade = 1.0

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "student-success-tool"
3-
version = "0.3.8"
3+
version = "0.3.9"
44
description = "School-agnostic lib for implementing Student Success Tool workflows."
55
readme = "README.md"
66
requires-python = ">=3.10,<3.13"

src/student_success_tool/configs/pdp.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@ class PreprocessingConfig(pyd.BaseModel):
159159
),
160160
)
161161
)
162+
include_pre_cohort_courses: bool = pyd.Field(
163+
default=False,
164+
description=(
165+
"Whether to include course records that occurred before the student's cohort term. Usually, we do end up excluding these so the default will always be False unless set otherwise."
166+
),
167+
)
162168

163169
@pyd.field_validator("splits", mode="after")
164170
@classmethod

0 commit comments

Comments
 (0)