@@ -69,17 +69,18 @@ def get_git_commit_hash() -> str | None:
         return None
 
 
-def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str, dict]]:
+def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str, dict], dict[str, str]]:
     """Load prompts and generation metrics using PROMPTS dict from bulk_run.
 
     Returns:
-        (prompts_dict, metrics_dict) where metrics_dict contains cost_usd, input_tokens, output_tokens, turns
+        (prompts_dict, metrics_dict, run_config_dict) where metrics_dict contains cost_usd, input_tokens, output_tokens, turns
+        and run_config_dict contains mcp_binary, backend, model
     """
     try:
         # Import PROMPTS from bulk_run.py
         from bulk_run import PROMPTS
     except ImportError:
-        return {}, {}
+        return {}, {}, {}
 
     # Look for bulk_run_results file
     script_dir = Path(__file__).parent
@@ -88,12 +89,22 @@ def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str,
     results_files = sorted(script_dir.glob("../app/bulk_run_results_*.json"), reverse=True)
 
     if not results_files:
-        return dict(PROMPTS), {}
+        return dict(PROMPTS), {}, {}
 
     # Load generation metrics from results file
     try:
         data = json.loads(results_files[0].read_text())
 
+        # Extract run configuration from first result
+        run_config = {}
+        if data and len(data) > 0:
+            first_result = data[0]
+            run_config = {
+                "mcp_binary": first_result.get("mcp_binary", "cargo run (default)"),
+                "backend": first_result.get("backend", "claude"),
+                "model": first_result.get("model"),
+            }
+
         # Create a prompt->metrics mapping
         prompt_to_metrics = {}
         for result in data:
@@ -113,10 +124,10 @@ def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str,
             if prompt in prompt_to_metrics:
                 gen_metrics[app_name] = prompt_to_metrics[prompt]
 
-        return dict(PROMPTS), gen_metrics
+        return dict(PROMPTS), gen_metrics, run_config
 
     except Exception:
-        return dict(PROMPTS), {}
+        return dict(PROMPTS), {}, {}
 
 
 def generate_summary_report(results: list[dict]) -> dict:
@@ -599,6 +610,24 @@ def parse_args():
         help='Run N evaluations in parallel (default: 1 = sequential). Use -j 0 for auto (CPU count)'
     )
 
+    parser.add_argument(
+        '--mcp-binary',
+        metavar='PATH',
+        help='MCP binary path (overrides value from bulk_run results)'
+    )
+
+    parser.add_argument(
+        '--backend',
+        metavar='NAME',
+        help='Backend used (claude/litellm, overrides value from bulk_run results)'
+    )
+
+    parser.add_argument(
+        '--model',
+        metavar='NAME',
+        help='Model used (overrides value from bulk_run results)'
+    )
+
     filter_group = parser.add_argument_group('app filtering')
     filter_group.add_argument(
         '--apps',
@@ -742,7 +771,15 @@ async def main_async():
         sys.exit(1)
 
     # Load prompts and generation metrics from bulk_run.py and bulk_run_results
-    prompts, gen_metrics = load_prompts_and_metrics_from_bulk_run()
+    prompts, gen_metrics, run_config = load_prompts_and_metrics_from_bulk_run()
+
+    # Override run config with CLI args if provided
+    if args.mcp_binary:
+        run_config["mcp_binary"] = args.mcp_binary
+    if args.backend:
+        run_config["backend"] = args.backend
+    if args.model:
+        run_config["model"] = args.model
 
     # Get all app directories
     all_app_dirs = [d for d in sorted(apps_dir.iterdir()) if d.is_dir() and not d.name.startswith(".")]
@@ -924,12 +961,22 @@ async def evaluate_with_semaphore(index, app_dir):
         run_id = tracker.start_run(run_name=run_name, tags=tags)
 
         # Log parameters
-        tracker.log_evaluation_parameters(
-            mode="evaluation",
-            total_apps=summary['total_apps'],
-            timestamp=timestamp,
-            model_version="claude-sonnet-4-5-20250929"
-        )
+        params = {
+            "mode": "evaluation",
+            "total_apps": summary['total_apps'],
+            "timestamp": timestamp,
+            "model_version": "claude-sonnet-4-5-20250929",
+        }
+
+        # Add run config parameters if available
+        if run_config.get("mcp_binary"):
+            params["mcp_binary"] = run_config["mcp_binary"]
+        if run_config.get("backend"):
+            params["backend"] = run_config["backend"]
+        if run_config.get("model"):
+            params["llm_model"] = run_config["model"]
+
+        tracker.log_evaluation_parameters(**params)
 
         # Log metrics from evaluation report
         tracker.log_evaluation_metrics(full_report)
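
For reference, a minimal sketch of the shape of a `bulk_run_results_*.json` entry that this loader now reads. Only the key names (`mcp_binary`, `backend`, `model`, and the metrics fields listed in the docstring) come from the hunks above; the example values, the `prompt` key, and the overall file layout are assumptions, not confirmed by this diff.

```python
# Hypothetical bulk_run_results_*.json entry (the file is assumed to be a JSON
# list of such dicts). Key names follow the .get(...) calls and the docstring
# in the diff above; the values here are made up for illustration.
example_result = {
    "prompt": "Build a todo app ...",       # assumed: matched back to PROMPTS values
    "cost_usd": 0.42,                        # per-app generation metrics
    "input_tokens": 12345,
    "output_tokens": 6789,
    "turns": 7,
    "mcp_binary": "target/release/mcp",      # new run-config fields, read from the
    "backend": "claude",                     # first result ("claude" or "litellm")
    "model": "claude-sonnet-4-5-20250929",
}
```

If the results file lacks these run-config keys, the loader falls back to `"cargo run (default)"` for `mcp_binary` and `"claude"` for `backend`, and the new `--mcp-binary`, `--backend`, and `--model` flags override whatever was loaded before the values are logged as MLflow-style parameters.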