Commit 91ce1b0

Merge pull request #270 from datakind/develop
v3.10 dev -> main
2 parents: cfdaef0 + d4e9a5a

38 files changed: +2215 -405 lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,9 @@
 # CHANGELOG
 
+## 0.3.10 (2025-08)
+- Created automated model selection (PR 267)
+- Corrected aliasing in custom model cards (PR 264)
+
 ## 0.3.9 (2025-07)
 - Add pre-cohort courses to config (PR 258)
 

pipelines/custom/institution_id/02-train-model-TEMPLATE.py

Lines changed: 33 additions & 4 deletions
@@ -27,7 +27,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 # %restart_python
@@ -209,15 +209,13 @@
 # COMMAND ----------
 
 # Get top runs from experiment for evaluation
-# Adjust optimization metrics & topn_runs_included as needed
 top_runs = modeling.evaluation.get_top_runs(
     experiment_id,
     optimization_metrics=[
         "test_recall_score",
-        "val_recall_score",
         "test_roc_auc",
-        "val_roc_auc",
         "test_log_loss",
+        "test_f1_score",
         "val_log_loss",
     ],
     topn_runs_included=cfg.modeling.evaluation.topn_runs_included,
@@ -262,3 +260,34 @@
 )
 logging.info("Run %s: Completed", run_id)
 mlflow.end_run()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC # model selection
+
+# COMMAND ----------
+
+# Rank top runs again after evaluation for model selection
+selected_runs = modeling.evaluation.get_top_runs(
+    experiment_id,
+    optimization_metrics=[
+        "test_recall_score",
+        "test_roc_auc",
+        "test_log_loss",
+        "test_bias_score_mean",
+    ],
+    topn_runs_included=cfg.modeling.evaluation.topn_runs_included,
+)
+# Extract the top run
+top_run_name, top_run_id = next(iter(selected_runs.items()))
+logging.info(f"Selected top run for perf and bias: {top_run_name} - {top_run_id}")
+
+# COMMAND ----------
+
+# Update config with run and experiment ids
+modeling.utils.update_run_metadata_in_toml(
+    config_path="./config.toml",
+    run_id=top_run_id,
+    experiment_id=experiment_id,
+)
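For reference, the new selection cell relies on get_top_runs returning its runs ranked best-first, as implied by the template taking the first item with next(iter(...)). A minimal sketch of that assumed contract; the run names and ids below are hypothetical placeholders, not from any real experiment:

# Assumed shape of get_top_runs output: an ordered mapping of
# run name -> MLflow run id, ranked best-first on the chosen metrics.
selected_runs = {
    "stalwart-owl-123": "a1b2c3d4e5f6",  # hypothetical top-ranked run
    "peppy-finch-456": "f6e5d4c3b2a1",
}

# dicts preserve insertion order (Python 3.7+), so next(iter(...))
# pulls the top-ranked entry, exactly as the template does.
top_run_name, top_run_id = next(iter(selected_runs.items()))
assert (top_run_name, top_run_id) == ("stalwart-owl-123", "a1b2c3d4e5f6")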

pipelines/custom/institution_id/03-make-predictions-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 # %restart_python

pipelines/custom/institution_id/04-register-model-create-card-TEMPLATE.py

Lines changed: 39 additions & 9 deletions
@@ -8,7 +8,6 @@
 # MAGIC
 # MAGIC - [Data science product components (Confluence doc)](https://datakind.atlassian.net/wiki/spaces/TT/pages/237862913/Data+science+product+components+the+modeling+process)
 # MAGIC - [Databricks runtimes release notes](https://docs.databricks.com/en/release-notes/runtime/index.html)
-# MAGIC
 
 # COMMAND ----------
 
@@ -21,7 +20,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 # %restart_python
@@ -34,9 +33,7 @@
 from databricks.connect import DatabricksSession
 
 from student_success_tool import dataio, configs, modeling
-
-# TODO for Vish: implement CustomModelCard
-# from student_success_tool.reporting.model_card.pdp import PDPModelCard
+from student_success_tool.reporting.model_card.custom import CustomModelCard
 
 # COMMAND ----------
 
@@ -68,10 +65,11 @@
 
 # COMMAND ----------
 
-# project configuration stored as a config file in TOML format
-cfg = dataio.read_config(
-    "./config-TEMPLATE.toml", schema=configs.custom.CustomProjectConfig
-)
+# project configuration should be stored in a config file in TOML format
+# it'll start out with just basic info: institution_id, institution_name
+# but as each step of the pipeline gets built, more parameters will be moved
+# from hard-coded notebook variables to shareable, persistent config fields
+cfg = dataio.read_config("./config-TEMPLATE.toml", schema=configs.pdp.PDPProjectConfig)
 cfg
 
 # COMMAND ----------
@@ -98,3 +96,35 @@
     registry_uri="databricks-uc",
     mlflow_client=client,
 )
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC # generate model card
+
+# COMMAND ----------
+
+# Initialize card
+card = CustomModelCard(
+    config=cfg, catalog=catalog, model_name=model_name, mlflow_client=client
+)
+
+# COMMAND ----------
+
+# Build context and download artifacts
+card.build()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Edit model card markdown file as you see fit before exporting as a PDF
+# MAGIC - A markdown should now exist in your local directory. Feel free to edit directly in DB's text editor before running the cell below.
+# MAGIC - You don't need to refresh the browser or restart your cluster etc, the model card function will re-read the markdown below before exporting as a PDF
+# MAGIC - You can access the PDF via ML artifacts in your registered model, as you will not be able to view the PDF locally in DB workspace.
+# MAGIC
+
+# COMMAND ----------
+
+# Reload & publish
+card.reload_card()
+card.export_to_pdf()
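The reload-then-export pair implies the card lives as an editable markdown file that is re-read from disk at export time, which is why manual edits in the Databricks text editor survive without a refresh or restart. A self-contained sketch of that round trip; the file name and exporter below are stand-ins, not CustomModelCard's actual internals:

from pathlib import Path

# Hypothetical card path; CustomModelCard manages its own file location.
card_path = Path("model-card.md")
card_path.write_text("# Model Card\n\nAuto-generated summary.\n")

# ... a human edits model-card.md in the Databricks text editor ...

def export_to_pdf(md_path: Path) -> None:
    # Re-read the markdown from disk rather than using any in-memory copy,
    # so manual edits are picked up before rendering.
    markdown = md_path.read_text()
    print(f"rendering {len(markdown)} characters of markdown to PDF")

export_to_pdf(card_path)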

pipelines/custom/institution_id/05-inference-validation-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %restart_python
 
 # COMMAND ----------

pipelines/custom/institution_id/config-TEMPLATE.toml

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ timeout_minutes = 10
 # exclude_cols = []
 
 [modeling.evaluation]
-topn_runs_included = 3
+topn_runs_included = 5
 
 [inference]
 num_top_features = 5
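Bumping topn_runs_included from 3 to 5 widens the candidate pool that both get_top_runs calls in the training template read via cfg.modeling.evaluation.topn_runs_included. A minimal sketch of how a field like this comes out of TOML, using the stdlib tomllib as a stand-in for the repo's dataio.read_config (which additionally validates against a schema):

import tomllib  # stdlib TOML parser, Python 3.11+

config_text = """
[modeling.evaluation]
topn_runs_included = 5
"""

# The raw parse is just nested dicts; dataio.read_config exposes the same
# value as the attribute cfg.modeling.evaluation.topn_runs_included.
cfg = tomllib.loads(config_text)
topn = cfg["modeling"]["evaluation"]["topn_runs_included"]
assert topn == 5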

pipelines/pdp/inference/pdp_inference/workflow_asset_bundle/resources/github_sourced_pdp_inference_pipeline.yml

Lines changed: 9 additions & 9 deletions
@@ -49,7 +49,7 @@ resources:
           - --course_dataset_validated_path
           - "{{tasks.data_ingestion.values.course_dataset_validated_path}}"
           - --toml_file_path
-          - "/Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/{{job.parameters.databricks_institution_name}}_{{job.parameters.model_name}}_configuration_file.toml"
+          - "/Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/config.toml"
           - --custom_schemas_path
           - "{{job.parameters.custom_schemas_path}}"
         job_cluster_key: pdp-inference-pipeline-cluster
@@ -73,7 +73,7 @@ resources:
           - --input_table_path
           - "{{tasks.data_preprocessing.values.processed_dataset_path}}"
           - --input_schema_path
-          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/schema.pbtxt # TODO(samroon2): Update once finalized.
+          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/schema.pbtxt # TODO(samroon2): Update once finalized.
           - --output_artifact_path
           - "{{tasks.data_ingestion.values.job_root_dir}}"
           - --environment
@@ -97,7 +97,7 @@ resources:
           - --input_table_path
           - "{{tasks.data_preprocessing.values.processed_dataset_path}}"
           - --input_schema_path
-          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/schema.pbtxt # TODO(samroon2): Update once finalized.
+          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/schema.pbtxt # TODO(samroon2): Update once finalized.
           - --output_artifact_path
           - "{{tasks.data_ingestion.values.job_root_dir}}"
           - --environment
@@ -135,7 +135,7 @@ resources:
           - --DK_CC_EMAIL
           - "{{job.parameters.DK_CC_EMAIL}}"
           - --modeling_table_path
-          - "{{job.parameters.DB_workspace}}.{{job.parameters.databricks_institution_name}}_gold.modeling_table"
+          - "{{job.parameters.DB_workspace}}.{{job.parameters.databricks_institution_name}}_silver.{{job.parameters.databricks_institution_name}}_pdp_modeling_ar_deid"
           - --custom_schemas_path
           - "{{job.parameters.custom_schemas_path}}"
         job_cluster_key: pdp-inference-pipeline-cluster
@@ -212,19 +212,19 @@ resources:
        enabled: true
      parameters:
        - name: cohort_file_name
-         default: kentucky_state_uni_pdp_ar_deid_20241029000400.csv
+         default: AO1600pdp_AO1600_AR_DEIDENTIFIED_STUDYID_20250522120554.csv
        - name: course_file_name
-         default: kentucky_state_uni_pdp_course_ar_deid_20241029000414_dedup.csv
+         default: AO1600pdp_AO1600_COURSE_LEVEL_AR_DEIDENTIFIED_STUDYID_20250522120554.csv
        - name: databricks_institution_name
-         default: kentucky_state_uni
+         default: midway_uni
        - name: db_run_id
          default: "{{job.run_id}}"
        - name: DB_workspace
          default: ${var.DB_workspace}
        - name: gcp_bucket_name
-         default: dev_6782b2f451f84c17ae6e14e918432b65
+         default: databricks-2052166062819251-unitycatalog
        - name: model_name
-         default: kentucky_state_uni_retention_end_of_first_year
+         default: midway_uni_graduation_4y_end_of_first_year
        - name: model_type
          default: sklearn
        - name: notification_email
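To make the --modeling_table_path change concrete, here is how the old and new three-part table names resolve once Databricks substitutes the {{job.parameters.*}} placeholders; the workspace and institution values below are hypothetical examples:

# Hypothetical substitutions for the job parameters.
workspace = "staging_catalog"   # {{job.parameters.DB_workspace}}
institution = "midway_uni"      # {{job.parameters.databricks_institution_name}}

old_path = f"{workspace}.{institution}_gold.modeling_table"
new_path = f"{workspace}.{institution}_silver.{institution}_pdp_modeling_ar_deid"

print(old_path)  # staging_catalog.midway_uni_gold.modeling_table
print(new_path)  # staging_catalog.midway_uni_silver.midway_uni_pdp_modeling_ar_deid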
