Skip to content

Commit d047b86

Browse files
Merge pull request #232 from datakind/develop
release 0.3.6
2 parents c5b6e17 + d677669 commit d047b86

File tree

11 files changed

+61
-17
lines changed

11 files changed

+61
-17
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# CHANGELOG
22

3+
## 0.3.6 (2025-06)
4+
- Fixed bug in features table (PR 229)
5+
- Fixed bug in 12 credit features (PR 230)
6+
37
## 0.3.5 (2025-06)
48
- Added support scores to features (PR 222)
59
- Limit boolean features to courses and subjects (PR 223)

pipelines/pdp/institution_id/00-data-assessment-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
# install dependencies, of which most/all should come through our 1st-party SST package
2727

28-
# %pip install "student-success-tool == 0.3.5"
28+
# %pip install "student-success-tool == 0.3.6"
2929

3030
# COMMAND ----------
3131

pipelines/pdp/institution_id/01-preprocess-data-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
# install dependencies, of which most/all should come through our 1st-party SST package
2626

27-
# %pip install "student-success-tool == 0.3.5"
27+
# %pip install "student-success-tool == 0.3.6"
2828

2929
# COMMAND ----------
3030

pipelines/pdp/institution_id/02-train-model-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# we need to manually install a certain version of pandas and scikit-learn in order
2929
# for our models to load and run properly.
3030

31-
# %pip install "student-success-tool==0.3.5"
31+
# %pip install "student-success-tool==0.3.6"
3232
# %pip install "pandas==1.5.3"
3333
# %pip install "scikit-learn==1.3.0"
3434

pipelines/pdp/institution_id/03-make-predictions-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
# we need to manually install a certain version of pandas and scikit-learn in order
2727
# for our models to load and run properly.
2828

29-
# %pip install "student-success-tool==0.3.5"
29+
# %pip install "student-success-tool==0.3.6"
3030
# %pip install "pandas==1.5.3"
3131
# %pip install "scikit-learn==1.3.0"
3232

pipelines/pdp/institution_id/04-register-model-create-card-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
# we need to manually install a certain version of pandas and scikit-learn in order
2222
# for our models to load and run properly.
2323

24-
# %pip install "student-success-tool==0.3.5"
24+
# %pip install "student-success-tool==0.3.6"
2525
# %pip install "pandas==1.5.3"
2626
# %pip install "scikit-learn==1.3.0"
2727

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "student-success-tool"
3-
version = "0.3.5"
3+
version = "0.3.6"
44
description = "School-agnostic lib for implementing Student Success Tool workflows."
55
readme = "README.md"
66
requires-python = ">=3.10,<3.13"

src/student_success_tool/assets/pdp/features_table.toml

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -277,12 +277,11 @@ term_is_pre_cohort_cumsum = { name = "number of terms so far that occurred befor
277277
term_is_while_student_enrolled_at_other_inst = { name = "term occurred while student is enrolled at another institution", desc = "This is another binary feature (True/False) indicating whether a given (usually the checkpoint) academic term overlapped with a period when the student was concurrently enrolled at another institution. A value of \"True\" suggests simultaneous enrollment elsewhere in a given (usually the checkpoint) term, while a value of \"False\" means the student was not enrolled at another institution in a given (usually the checkpoint) term." }
278278
term_is_while_student_enrolled_at_other_inst_cumsum = { name = "number of terms so far that occurred while student was enrolled at another institution", desc = "This feature represents the cumulative number of academic terms in which a student was concurrently enrolled at another institution up until the given (usually the checkpoint) term. A higher value for this feature indicates that a student has taken several courses while enrolled elsewhere over their academic journey so far, while a lower value suggests limited or no overlap with enrollment at other institutions over a student's academic journey so far." }
279279
year_of_enrollment_at_cohort_inst = { name = "year of enrollment at cohort institution", desc = "The current year the student is enrolled in at the cohort institution. Ex. year 1, year 2, year 3, etc." }
280-
'took_course_course_type_subject_area_(\d+)$' = { name = "took course in subject area '{}' this term", desc = "Whether the student has taken a course in a specified subject area in a given (usually the checkpoint) term. A 1 would indicate the student did take a course in this subject area and a 0 would mean the student did not." }
281280
term_program_of_study = { name = "student's program of study this term", desc = "Student’s term program of study this term based on major CIP code reported by the cohort institution." }
282281
term_program_of_study_area = { name = "student's program of study area this term", desc = "Student’s term program of study area this term based on major CIP code reported by the cohort institution." }
283-
'^took_course_course_id_([a-z]+)_(\d+)$' = { name = "course with id '{}{}' taken this term", desc = "This feature represents if a student has taken in a specific course in a given (usually the checkpoint) term, based on the course ID. A value of 1 for this feature would mean that a student took the specified course in a given (usually the checkpoint) term, while a value of 0 for this feature would mean that the student did not take the specified course in a given (usually the checkpoint) term." }
284-
'^took_course_course_id_([a-z]+)_(\d+)_cummax$' = { name = "course with id '{} {}' taken so far", desc = "This feature represents if a student has ever taken a specific course up until (usually the checkpoint) term, based on the course ID." }
285-
'^took_course_course_id_([a-z]+)_(\d+)_cummax_in_12_creds$' = { name = "course with id '{} {}' taken within student’s first 12 credits", desc = "This feature represents if a student has taken in a specific course in their first 12 credits, based on the course ID. A value of 1 for this feature would mean that a student took the specified course in their first 12 credits, while a value of 0 for this feature would mean that the student did not take the specified course in their first 12 credits." }
286-
'^took_course_course_subject_area_(\d+)$' = { name = "courses taken in subject area '{}' this term", desc = "Indicator whether student took a course in a specified subject area in a given (usually the checkpoint) term." }
287-
'^took_course_course_subject_area_(\d+)_cummax$' = { name = "course in subject area '{}' taken so far", desc = "Whether the student has taken a course in a specified subject area at any point up to this term. A 1 would indicate the student did take a course in this subject area and a 0 would mean the student did not." }
288-
'^took_course_course_subject_area_(\d+)_cummax_in_12_creds$' = { name = "took course in subject area '{}' within student’s first 12 credits", desc = "Whether the student has taken a course in a specified subject area within their first 12 credits. A 1 would indicate the student did take a course in this subject area during their first 12 credits and a 0 would mean the student did not." }
282+
'^took_course_id_([a-z]+)_(\d+)$' = { name = "course with id '{}{}' taken this term", desc = "This feature represents if a student has taken in a specific course in a given (usually the checkpoint) term, based on the course ID. A value of 1 for this feature would mean that a student took the specified course in a given (usually the checkpoint) term, while a value of 0 for this feature would mean that the student did not take the specified course in a given (usually the checkpoint) term." }
283+
'^took_course_id_([a-z]+)_(\d+)_cummax$' = { name = "course with id '{} {}' taken so far", desc = "This feature represents if a student has ever taken a specific course up until (usually the checkpoint) term, based on the course ID. A 1 would indicate the student did take the specified course with this course id at some point up until the given term and a 0 would mean the student did not." }
284+
'^took_course_id_([a-z]+)_(\d+)_cummax_in_12_creds$' = { name = "course with id '{} {}' taken within student’s first 12 credits", desc = "This feature represents if a student has taken in a specific course in their first 12 credits, based on the course ID. A value of 1 for this feature would mean that a student took the specified course in their first 12 credits, while a value of 0 for this feature would mean that the student did not take the specified course in their first 12 credits." }
285+
'^took_course_subject_area_(\d+)$' = { name = "courses taken in subject area '{}' this term", desc = "Indicator whether student took a course in a specified subject area in a given (usually the checkpoint) term. A 1 would indicate the student did take a course in this subject area in the given term and a 0 would mean the student did not." }
286+
'^took_course_subject_area_(\d+)_cummax$' = { name = "course in subject area '{}' taken so far", desc = "Whether the student has taken a course in a specified subject area at any point up to this term. A 1 would indicate the student did take a course in this subject area at some point up until the given term and a 0 would mean the student did not." }
287+
'^took_course_subject_area_(\d+)_cummax_in_12_creds$' = { name = "took course in subject area '{}' within student’s first 12 credits", desc = "Whether the student has taken a course in a specified subject area within their first 12 credits. A 1 would indicate the student did take a course in this subject area during their first 12 credits and a 0 would mean the student did not." }

src/student_success_tool/preprocessing/pdp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def clean_up_labeled_dataset_cols_and_vals(
250250
num_credits_col: Name of the column containing cumulative earned credits.
251251
"""
252252
num_credit_check = constants.DEFAULT_COURSE_CREDIT_CHECK
253-
credit_pattern = re.compile(rf"in_{num_credit_check}_credits")
253+
credit_pattern = re.compile(rf"in_{num_credit_check}_creds")
254254
# To prevent data leakage, students that have not reached the 12 credits and not taken the course
255255
# by the checkpoint term (which this data is limited to at the time of this function),
256256
# will have the applicable in_12_creds columns set to null.

tests/assets/test_features_table.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pytest
2+
import re
23
import os
34
from student_success_tool.dataio.read import from_toml_file
45

@@ -31,3 +32,43 @@ def test_all_features_have_name_and_desc(feature_table_data):
3132
assert "desc" in entry and entry["desc"].strip(), (
3233
f"'desc' missing or empty in feature: {feature_id}"
3334
)
35+
36+
37+
# Add regex test cases to this as needed! This helps us make sure
38+
# that our regex patterns work *as intended* in our features table
39+
VALID_FEATURE_NAMES = [
40+
"took_course_id_eng_101",
41+
"took_course_id_eng_101_cummax",
42+
"took_course_id_eng_101_cummax_in_12_creds",
43+
"took_course_subject_area_51",
44+
"took_course_subject_area_51_cummax",
45+
"took_course_subject_area_51_cummax_in_12_creds",
46+
"num_courses_course_subject_area_51",
47+
"num_courses_course_subject_area_51_cumfrac",
48+
"frac_courses_course_subject_area_51",
49+
"num_courses_course_id_eng_101",
50+
"num_courses_course_id_eng_101_cumfrac",
51+
"frac_courses_course_id_eng_101",
52+
]
53+
54+
55+
@pytest.mark.parametrize("feature_name", VALID_FEATURE_NAMES)
56+
def test_feature_matches_some_regex_key(feature_name, feature_table_data):
57+
"""Check if each valid feature name matches at least one of the TOML regex keys."""
58+
59+
def is_likely_regex(key: str) -> bool:
60+
# Matches if the key contains metacharacters indicating it's a regex
61+
return bool(re.search(r"[\(\[\.\*\+\?\\]", key))
62+
63+
# Only consider keys containing regex metacharacters — implying dynamic regex patterns
64+
regex_keys = [key for key in feature_table_data.keys() if is_likely_regex(key)]
65+
66+
# Compile the regex patterns
67+
compiled_patterns = [re.compile(pattern) for pattern in regex_keys]
68+
69+
# Check if the feature_name matches ANY of them
70+
matched = any(pat.fullmatch(feature_name) for pat in compiled_patterns)
71+
72+
assert matched, (
73+
f"Feature '{feature_name}' did not match any regex pattern with \\d in the TOML"
74+
)

0 commit comments

Comments
 (0)