
Commit 73f5a5f

Additional reporting params to MLFlow (#572)
OTEL_SDK_DISABLED=true python3 cli/evaluate_all.py --limit 1 --mcp-binary="test-binary-v1.0" --backend="claude-test" --model="claude-sonnet-4-5"

Co-authored-by: Evgenii Kniazev <[email protected]>
1 parent 7b308ed commit 73f5a5f

File tree

2 files changed (+72, -13 lines)


klaudbiusz/cli/bulk_run.py

Lines changed: 12 additions & 0 deletions
@@ -35,6 +35,9 @@ class RunResult(TypedDict):
     metrics: GenerationMetrics | None
     error: str | None
     app_dir: str | None
+    mcp_binary: str | None
+    backend: str
+    model: str | None


 def run_single_generation(
@@ -99,6 +102,9 @@ def timeout_handler(signum, frame):
             "metrics": metrics,
             "error": None,
             "app_dir": app_dir,
+            "mcp_binary": mcp_binary,
+            "backend": backend,
+            "model": model,
         }
     except TimeoutError as e:
         signal.alarm(0)  # cancel timeout
@@ -109,6 +115,9 @@ def timeout_handler(signum, frame):
             "metrics": None,
             "error": str(e),
             "app_dir": None,
+            "mcp_binary": mcp_binary,
+            "backend": backend,
+            "model": model,
         }
     except Exception as e:
         signal.alarm(0)  # cancel timeout
@@ -119,6 +128,9 @@ def timeout_handler(signum, frame):
             "metrics": None,
             "error": str(e),
             "app_dir": None,
+            "mcp_binary": mcp_binary,
+            "backend": backend,
+            "model": model,
         }

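Taken together, a minimal sketch of the extended result shape. GenerationMetrics and any other RunResult fields are not shown in this diff, so their shapes here are assumed, and the helper below is hypothetical, mirroring the success-path dict built in run_single_generation:

from typing import TypedDict


class GenerationMetrics(TypedDict, total=False):
    # Assumed shape, based on the fields named in evaluate_all.py's docstring.
    cost_usd: float
    input_tokens: int
    output_tokens: int
    turns: int


class RunResult(TypedDict):
    # Only the fields visible in this diff; the original class may define more.
    metrics: GenerationMetrics | None
    error: str | None
    app_dir: str | None
    mcp_binary: str | None  # new: which MCP binary produced the run
    backend: str            # new: e.g. "claude" or "litellm"
    model: str | None       # new: model identifier, if one was specified


def success_result(metrics: GenerationMetrics, app_dir: str,
                   mcp_binary: str | None, backend: str, model: str | None) -> RunResult:
    # Hypothetical helper; run_single_generation builds these dicts inline.
    return {
        "metrics": metrics,
        "error": None,
        "app_dir": app_dir,
        "mcp_binary": mcp_binary,
        "backend": backend,
        "model": model,
    }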

klaudbiusz/cli/evaluate_all.py

Lines changed: 60 additions & 13 deletions
@@ -69,17 +69,18 @@ def get_git_commit_hash() -> str | None:
         return None


-def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str, dict]]:
+def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str, dict], dict[str, str]]:
     """Load prompts and generation metrics using PROMPTS dict from bulk_run.

     Returns:
-        (prompts_dict, metrics_dict) where metrics_dict contains cost_usd, input_tokens, output_tokens, turns
+        (prompts_dict, metrics_dict, run_config_dict) where metrics_dict contains cost_usd, input_tokens, output_tokens, turns
+        and run_config_dict contains mcp_binary, backend, model
     """
     try:
         # Import PROMPTS from bulk_run.py
         from bulk_run import PROMPTS
     except ImportError:
-        return {}, {}
+        return {}, {}, {}

     # Look for bulk_run_results file
     script_dir = Path(__file__).parent
@@ -88,12 +89,22 @@ def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str,
     results_files = sorted(script_dir.glob("../app/bulk_run_results_*.json"), reverse=True)

     if not results_files:
-        return dict(PROMPTS), {}
+        return dict(PROMPTS), {}, {}

     # Load generation metrics from results file
     try:
         data = json.loads(results_files[0].read_text())

+        # Extract run configuration from first result
+        run_config = {}
+        if data and len(data) > 0:
+            first_result = data[0]
+            run_config = {
+                "mcp_binary": first_result.get("mcp_binary", "cargo run (default)"),
+                "backend": first_result.get("backend", "claude"),
+                "model": first_result.get("model"),
+            }
+
         # Create a prompt->metrics mapping
         prompt_to_metrics = {}
         for result in data:
@@ -113,10 +124,10 @@ def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str,
             if prompt in prompt_to_metrics:
                 gen_metrics[app_name] = prompt_to_metrics[prompt]

-        return dict(PROMPTS), gen_metrics
+        return dict(PROMPTS), gen_metrics, run_config

     except Exception:
-        return dict(PROMPTS), {}
+        return dict(PROMPTS), {}, {}


 def generate_summary_report(results: list[dict]) -> dict:
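For reference, a small illustration of the new three-tuple, assuming a results file whose entries predate these fields, so the .get() defaults from the hunks above apply (values below follow directly from those defaults):

prompts, gen_metrics, run_config = load_prompts_and_metrics_from_bulk_run()
# run_config == {
#     "mcp_binary": "cargo run (default)",
#     "backend": "claude",
#     "model": None,
# }
# On ImportError all three values are empty dicts; when no results file is found,
# prompts are still returned but gen_metrics and run_config stay empty.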
@@ -599,6 +610,24 @@ def parse_args():
         help='Run N evaluations in parallel (default: 1 = sequential). Use -j 0 for auto (CPU count)'
     )

+    parser.add_argument(
+        '--mcp-binary',
+        metavar='PATH',
+        help='MCP binary path (overrides value from bulk_run results)'
+    )
+
+    parser.add_argument(
+        '--backend',
+        metavar='NAME',
+        help='Backend used (claude/litellm, overrides value from bulk_run results)'
+    )
+
+    parser.add_argument(
+        '--model',
+        metavar='NAME',
+        help='Model used (overrides value from bulk_run results)'
+    )
+
     filter_group = parser.add_argument_group('app filtering')
     filter_group.add_argument(
         '--apps',
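A minimal sketch of the parsing behaviour these flags rely on (standard argparse semantics, not code from this repo): optional flags default to None, which is why the truthy checks in the next hunk only override run_config when a value was actually supplied on the command line.

import argparse

# Hypothetical stand-alone parser mirroring just the three new flags.
parser = argparse.ArgumentParser()
parser.add_argument('--mcp-binary', metavar='PATH')
parser.add_argument('--backend', metavar='NAME')
parser.add_argument('--model', metavar='NAME')

args = parser.parse_args(['--backend', 'claude-test'])
assert args.mcp_binary is None        # not supplied, so no override happens
assert args.backend == 'claude-test'  # supplied, so it overrides run_config["backend"]
assert args.model is None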
@@ -742,7 +771,15 @@ async def main_async():
         sys.exit(1)

     # Load prompts and generation metrics from bulk_run.py and bulk_run_results
-    prompts, gen_metrics = load_prompts_and_metrics_from_bulk_run()
+    prompts, gen_metrics, run_config = load_prompts_and_metrics_from_bulk_run()
+
+    # Override run config with CLI args if provided
+    if args.mcp_binary:
+        run_config["mcp_binary"] = args.mcp_binary
+    if args.backend:
+        run_config["backend"] = args.backend
+    if args.model:
+        run_config["model"] = args.model

     # Get all app directories
     all_app_dirs = [d for d in sorted(apps_dir.iterdir()) if d.is_dir() and not d.name.startswith(".")]
@@ -924,12 +961,22 @@ async def evaluate_with_semaphore(index, app_dir):
         run_id = tracker.start_run(run_name=run_name, tags=tags)

         # Log parameters
-        tracker.log_evaluation_parameters(
-            mode="evaluation",
-            total_apps=summary['total_apps'],
-            timestamp=timestamp,
-            model_version="claude-sonnet-4-5-20250929"
-        )
+        params = {
+            "mode": "evaluation",
+            "total_apps": summary['total_apps'],
+            "timestamp": timestamp,
+            "model_version": "claude-sonnet-4-5-20250929",
+        }
+
+        # Add run config parameters if available
+        if run_config.get("mcp_binary"):
+            params["mcp_binary"] = run_config["mcp_binary"]
+        if run_config.get("backend"):
+            params["backend"] = run_config["backend"]
+        if run_config.get("model"):
+            params["llm_model"] = run_config["model"]
+
+        tracker.log_evaluation_parameters(**params)

         # Log metrics from evaluation report
         tracker.log_evaluation_metrics(full_report)
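The tracker's implementation is not part of this diff. As a rough sketch, assuming it is a thin wrapper over the standard MLflow client (the class name, method bodies, and None-filtering below are assumptions; only the mlflow calls are real MLflow API), the keyword expansion could amount to:

import mlflow


class MlflowTracker:
    """Hypothetical wrapper; the repo's real tracker may differ."""

    def start_run(self, run_name: str, tags: dict | None = None) -> str:
        run = mlflow.start_run(run_name=run_name, tags=tags)
        return run.info.run_id

    def log_evaluation_parameters(self, **params) -> None:
        # MLflow stores params as strings; skip values that were never set (e.g. model=None).
        mlflow.log_params({key: str(value) for key, value in params.items() if value is not None})

With this change, an evaluation run records mcp_binary, backend, and llm_model alongside the existing mode, total_apps, timestamp, and model_version parameters.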
