Commit 91ce1b0

Merge pull request #270 from datakind/develop
v3.10 dev -> main
2 parents: cfdaef0 + d4e9a5a

38 files changed: +2215 -405 lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,9 @@
 # CHANGELOG
 
+## 0.3.10 (2025-08)
+- Created automated model selection (PR 267)
+- Corrected aliasing in custom model cards (PR 264)
+
 ## 0.3.9 (2025-07)
 - Add pre-cohort courses to config (PR 258)
 

pipelines/custom/institution_id/02-train-model-TEMPLATE.py

Lines changed: 33 additions & 4 deletions
@@ -27,7 +27,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 # %restart_python
@@ -209,15 +209,13 @@
 # COMMAND ----------
 
 # Get top runs from experiment for evaluation
-# Adjust optimization metrics & topn_runs_included as needed
 top_runs = modeling.evaluation.get_top_runs(
     experiment_id,
     optimization_metrics=[
         "test_recall_score",
-        "val_recall_score",
         "test_roc_auc",
-        "val_roc_auc",
         "test_log_loss",
+        "test_f1_score",
         "val_log_loss",
     ],
     topn_runs_included=cfg.modeling.evaluation.topn_runs_included,
@@ -262,3 +260,34 @@
 )
 logging.info("Run %s: Completed", run_id)
 mlflow.end_run()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC # model selection
+
+# COMMAND ----------
+
+# Rank top runs again after evaluation for model selection
+selected_runs = modeling.evaluation.get_top_runs(
+    experiment_id,
+    optimization_metrics=[
+        "test_recall_score",
+        "test_roc_auc",
+        "test_log_loss",
+        "test_bias_score_mean",
+    ],
+    topn_runs_included=cfg.modeling.evaluation.topn_runs_included,
+)
+# Extract the top run
+top_run_name, top_run_id = next(iter(selected_runs.items()))
+logging.info(f"Selected top run for perf and bias: {top_run_name} - {top_run_id}")
+
+# COMMAND ----------
+
+# Update config with run and experiment ids
+modeling.utils.update_run_metadata_in_toml(
+    config_path="./config.toml",
+    run_id=top_run_id,
+    experiment_id=experiment_id,
+)
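For reference, the new selection cell relies on get_top_runs returning its runs ranked best-first, as implied by the template taking the first item with next(iter(...)). A minimal sketch of that assumed contract; the run names and ids below are hypothetical placeholders, not from any real experiment:

# Assumed shape of get_top_runs output: an ordered mapping of
# run name -> MLflow run id, ranked best-first on the chosen metrics.
selected_runs = {
    "stalwart-owl-123": "a1b2c3d4e5f6",  # hypothetical top-ranked run
    "peppy-finch-456": "f6e5d4c3b2a1",
}

# dicts preserve insertion order (Python 3.7+), so next(iter(...))
# pulls the top-ranked entry, exactly as the template does.
top_run_name, top_run_id = next(iter(selected_runs.items()))
assert (top_run_name, top_run_id) == ("stalwart-owl-123", "a1b2c3d4e5f6")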

pipelines/custom/institution_id/03-make-predictions-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 # %restart_python

pipelines/custom/institution_id/04-register-model-create-card-TEMPLATE.py

Lines changed: 39 additions & 9 deletions
@@ -8,7 +8,6 @@
 # MAGIC
 # MAGIC - [Data science product components (Confluence doc)](https://datakind.atlassian.net/wiki/spaces/TT/pages/237862913/Data+science+product+components+the+modeling+process)
 # MAGIC - [Databricks runtimes release notes](https://docs.databricks.com/en/release-notes/runtime/index.html)
-# MAGIC
 
 # COMMAND ----------
 
@@ -21,7 +20,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %pip install "pandas==1.5.3"
 # %pip install "scikit-learn==1.3.0"
 # %restart_python
@@ -34,9 +33,7 @@
 from databricks.connect import DatabricksSession
 
 from student_success_tool import dataio, configs, modeling
-
-# TODO for Vish: implement CustomModelCard
-# from student_success_tool.reporting.model_card.pdp import PDPModelCard
+from student_success_tool.reporting.model_card.custom import CustomModelCard
 
 # COMMAND ----------
 
@@ -68,10 +65,11 @@
 
 # COMMAND ----------
 
-# project configuration stored as a config file in TOML format
-cfg = dataio.read_config(
-    "./config-TEMPLATE.toml", schema=configs.custom.CustomProjectConfig
-)
+# project configuration should be stored in a config file in TOML format
+# it'll start out with just basic info: institution_id, institution_name
+# but as each step of the pipeline gets built, more parameters will be moved
+# from hard-coded notebook variables to shareable, persistent config fields
+cfg = dataio.read_config("./config-TEMPLATE.toml", schema=configs.pdp.PDPProjectConfig)
 cfg
 
 # COMMAND ----------
@@ -98,3 +96,35 @@
     registry_uri="databricks-uc",
     mlflow_client=client,
 )
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC # generate model card
+
+# COMMAND ----------
+
+# Initialize card
+card = CustomModelCard(
+    config=cfg, catalog=catalog, model_name=model_name, mlflow_client=client
+)
+
+# COMMAND ----------
+
+# Build context and download artifacts
+card.build()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Edit model card markdown file as you see fit before exporting as a PDF
+# MAGIC - A markdown should now exist in your local directory. Feel free to edit directly in DB's text editor before running the cell below.
+# MAGIC - You don't need to refresh the browser or restart your cluster etc, the model card function will re-read the markdown below before exporting as a PDF
+# MAGIC - You can access the PDF via ML artifacts in your registered model, as you will not be able to view the PDF locally in DB workspace.
+# MAGIC
+
+# COMMAND ----------
+
+# Reload & publish
+card.reload_card()
+card.export_to_pdf()
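The reload-then-export pair implies the card lives as an editable markdown file that is re-read from disk at export time, which is why manual edits in the Databricks text editor survive without a refresh or restart. A self-contained sketch of that round trip; the file name and exporter below are stand-ins, not CustomModelCard's actual internals:

from pathlib import Path

# Hypothetical card path; CustomModelCard manages its own file location.
card_path = Path("model-card.md")
card_path.write_text("# Model Card\n\nAuto-generated summary.\n")

# ... a human edits model-card.md in the Databricks text editor ...

def export_to_pdf(md_path: Path) -> None:
    # Re-read the markdown from disk rather than using any in-memory copy,
    # so manual edits are picked up before rendering.
    markdown = md_path.read_text()
    print(f"rendering {len(markdown)} characters of markdown to PDF")

export_to_pdf(card_path)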

pipelines/custom/institution_id/05-inference-validation-TEMPLATE.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 # we need to manually install a certain version of pandas and scikit-learn in order
 # for our models to load and run properly.
 
-# %pip install "student-success-tool==0.3.8"
+# %pip install "student-success-tool==0.3.10"
 # %restart_python
 
 # COMMAND ----------

pipelines/custom/institution_id/config-TEMPLATE.toml

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ timeout_minutes = 10
 # exclude_cols = []
 
 [modeling.evaluation]
-topn_runs_included = 3
+topn_runs_included = 5
 
 [inference]
 num_top_features = 5
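Bumping topn_runs_included from 3 to 5 widens the candidate pool that both get_top_runs calls in the training template read via cfg.modeling.evaluation.topn_runs_included. A minimal sketch of how a field like this comes out of TOML, using the stdlib tomllib as a stand-in for the repo's dataio.read_config (which additionally validates against a schema):

import tomllib  # stdlib TOML parser, Python 3.11+

config_text = """
[modeling.evaluation]
topn_runs_included = 5
"""

# The raw parse is just nested dicts; dataio.read_config exposes the same
# value as the attribute cfg.modeling.evaluation.topn_runs_included.
cfg = tomllib.loads(config_text)
topn = cfg["modeling"]["evaluation"]["topn_runs_included"]
assert topn == 5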

pipelines/pdp/inference/pdp_inference/workflow_asset_bundle/resources/github_sourced_pdp_inference_pipeline.yml

Lines changed: 9 additions & 9 deletions
@@ -49,7 +49,7 @@ resources:
           - --course_dataset_validated_path
           - "{{tasks.data_ingestion.values.course_dataset_validated_path}}"
           - --toml_file_path
-          - "/Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/{{job.parameters.databricks_institution_name}}_{{job.parameters.model_name}}_configuration_file.toml"
+          - "/Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/config.toml"
           - --custom_schemas_path
           - "{{job.parameters.custom_schemas_path}}"
         job_cluster_key: pdp-inference-pipeline-cluster
@@ -73,7 +73,7 @@ resources:
           - --input_table_path
           - "{{tasks.data_preprocessing.values.processed_dataset_path}}"
           - --input_schema_path
-          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/schema.pbtxt # TODO(samroon2): Update once finalized.
+          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/schema.pbtxt # TODO(samroon2): Update once finalized.
           - --output_artifact_path
           - "{{tasks.data_ingestion.values.job_root_dir}}"
           - --environment
@@ -97,7 +97,7 @@ resources:
           - --input_table_path
           - "{{tasks.data_preprocessing.values.processed_dataset_path}}"
           - --input_schema_path
-          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/configuration_files/schema.pbtxt # TODO(samroon2): Update once finalized.
+          - /Volumes/{{job.parameters.DB_workspace}}/{{job.parameters.databricks_institution_name}}_gold/gold_volume/inference_inputs/schema.pbtxt # TODO(samroon2): Update once finalized.
           - --output_artifact_path
           - "{{tasks.data_ingestion.values.job_root_dir}}"
           - --environment
@@ -135,7 +135,7 @@ resources:
           - --DK_CC_EMAIL
           - "{{job.parameters.DK_CC_EMAIL}}"
           - --modeling_table_path
-          - "{{job.parameters.DB_workspace}}.{{job.parameters.databricks_institution_name}}_gold.modeling_table"
+          - "{{job.parameters.DB_workspace}}.{{job.parameters.databricks_institution_name}}_silver.{{job.parameters.databricks_institution_name}}_pdp_modeling_ar_deid"
           - --custom_schemas_path
           - "{{job.parameters.custom_schemas_path}}"
         job_cluster_key: pdp-inference-pipeline-cluster
@@ -212,19 +212,19 @@ resources:
        enabled: true
      parameters:
        - name: cohort_file_name
-         default: kentucky_state_uni_pdp_ar_deid_20241029000400.csv
+         default: AO1600pdp_AO1600_AR_DEIDENTIFIED_STUDYID_20250522120554.csv
        - name: course_file_name
-         default: kentucky_state_uni_pdp_course_ar_deid_20241029000414_dedup.csv
+         default: AO1600pdp_AO1600_COURSE_LEVEL_AR_DEIDENTIFIED_STUDYID_20250522120554.csv
        - name: databricks_institution_name
-         default: kentucky_state_uni
+         default: midway_uni
        - name: db_run_id
          default: "{{job.run_id}}"
        - name: DB_workspace
          default: ${var.DB_workspace}
        - name: gcp_bucket_name
-         default: dev_6782b2f451f84c17ae6e14e918432b65
+         default: databricks-2052166062819251-unitycatalog
        - name: model_name
-         default: kentucky_state_uni_retention_end_of_first_year
+         default: midway_uni_graduation_4y_end_of_first_year
        - name: model_type
          default: sklearn
        - name: notification_email
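To make the --modeling_table_path change concrete, here is how the old and new three-part table names resolve once Databricks substitutes the {{job.parameters.*}} placeholders; the workspace and institution values below are hypothetical examples:

# Hypothetical substitutions for the job parameters.
workspace = "staging_catalog"   # {{job.parameters.DB_workspace}}
institution = "midway_uni"      # {{job.parameters.databricks_institution_name}}

old_path = f"{workspace}.{institution}_gold.modeling_table"
new_path = f"{workspace}.{institution}_silver.{institution}_pdp_modeling_ar_deid"

print(old_path)  # staging_catalog.midway_uni_gold.modeling_table
print(new_path)  # staging_catalog.midway_uni_silver.midway_uni_pdp_modeling_ar_deid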
