
Commit 5939250

Reorganize klaudbiusz CLI into logical subdirectories
- Created generation/, evaluation/, utils/ subdirectories
- Moved 23 Python files into organized structure
- Updated all imports to use new paths
- Fixed code quality issues (ruff, pyright)
- Updated README.md with new structure
- Added build artifacts to .gitignore
1 parent 8f204ec commit 5939250

27 files changed: +140 −130 lines
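The file diffs below all follow one import-migration pattern: flat, same-directory imports become absolute imports rooted at the `cli` package. A minimal before/after sketch, using module names taken from this commit's diffs (invocation of the scripts is unchanged; see the README diff):

```python
# Before: sibling modules imported by bare name, which only resolves when the
# script's own directory happens to be on sys.path.
# from shared import build_mcp_command, validate_mcp_manifest
# from eval_metrics import eff_units

# After: absolute imports rooted at the cli package, mirroring the new
# generation/, evaluation/, utils/ layout.
from cli.utils.shared import build_mcp_command, validate_mcp_manifest
from cli.evaluation.eval_metrics import eff_units
from cli.generation.bulk_run import PROMPTS
```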

klaudbiusz/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -18,3 +18,7 @@ __pycache__/
 *$py.class
 *.so
 .Python
+
+# Build artifacts
+.DS_Store
+.ruff_cache/

klaudbiusz/README.md

Lines changed: 38 additions & 28 deletions
@@ -30,29 +30,29 @@ cli/archive_evaluation.sh
 cli/cleanup_evaluation.sh

 # Generate a single app (Claude backend, default)
-uv run cli/single_run.py "Create a customer churn analysis dashboard"
+uv run cli/generation/single_run.py "Create a customer churn analysis dashboard"

 # Use LiteLLM backend with specific model
-uv run cli/single_run.py "Create a customer churn analysis dashboard" \
+uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \
   --backend=litellm --model=openrouter/minimax/minimax-m2

 # Batch generate from prompts (databricks set by default)
-uv run cli/bulk_run.py
+uv run cli/generation/bulk_run.py

 # Batch generate with test prompts
-uv run cli/bulk_run.py --prompts=test
+uv run cli/generation/bulk_run.py --prompts=test

 # Batch generate with LiteLLM backend
-uv run cli/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro
+uv run cli/generation/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro

 # Custom output directory
-uv run cli/bulk_run.py --output-dir=/path/to/custom/folder
+uv run cli/generation/bulk_run.py --output-dir=/path/to/custom/folder

 # Custom MCP binary (for testing modified edda_mcp)
-uv run cli/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp
+uv run cli/generation/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp

 # Combined example
-uv run cli/bulk_run.py \
+uv run cli/generation/bulk_run.py \
   --backend=litellm \
   --model=gemini/gemini-2.5-pro \
   --output-dir=./my-apps \
@@ -65,28 +65,28 @@ uv run cli/bulk_run.py \
 cd klaudbiusz

 # Evaluate all apps
-uv run cli/evaluate_all.py
+uv run cli/evaluation/evaluate_all.py

 # Parallel evaluation (faster for large batches)
-uv run cli/evaluate_all.py -j 4          # Run 4 evaluations in parallel
-uv run cli/evaluate_all.py -j 0          # Auto-detect CPU count
-uv run cli/evaluate_all.py --parallel 8  # Long form
+uv run cli/evaluation/evaluate_all.py -j 4          # Run 4 evaluations in parallel
+uv run cli/evaluation/evaluate_all.py -j 0          # Auto-detect CPU count
+uv run cli/evaluation/evaluate_all.py --parallel 8  # Long form

 # Partial evaluation (filter apps)
-uv run cli/evaluate_all.py --limit 5              # First 5 apps
-uv run cli/evaluate_all.py --apps app1 app2       # Specific apps
-uv run cli/evaluate_all.py --pattern "customer*"  # Pattern matching
-uv run cli/evaluate_all.py --skip 10 --limit 5    # Skip first 10, evaluate next 5
-uv run cli/evaluate_all.py --start-from app5      # Start from specific app
+uv run cli/evaluation/evaluate_all.py --limit 5              # First 5 apps
+uv run cli/evaluation/evaluate_all.py --apps app1 app2       # Specific apps
+uv run cli/evaluation/evaluate_all.py --pattern "customer*"  # Pattern matching
+uv run cli/evaluation/evaluate_all.py --skip 10 --limit 5    # Skip first 10, evaluate next 5
+uv run cli/evaluation/evaluate_all.py --start-from app5      # Start from specific app

 # Custom directory
-uv run cli/evaluate_all.py --dir /path/to/apps    # Evaluate apps in custom directory
+uv run cli/evaluation/evaluate_all.py --dir /path/to/apps    # Evaluate apps in custom directory

 # Staging environment (for testing)
-uv run cli/evaluate_all.py --staging              # Log to staging MLflow experiment
+uv run cli/evaluation/evaluate_all.py --staging   # Log to staging MLflow experiment

 # Evaluate single app
-uv run cli/evaluate_app.py ../app/customer-churn-analysis
+uv run cli/evaluation/evaluate_app.py ../app/customer-churn-analysis
 ```

 **Results are automatically logged to MLflow:** Navigate to `ML → Experiments → /Shared/klaudbiusz-evaluations` in Databricks UI / Googfooding.
@@ -151,11 +151,21 @@ klaudbiusz/
 │   └── DORA_METRICS.md            # DORA & agentic DevX
 ├── app/                           # Generated applications (gitignored)
 ├── cli/                           # Generation & evaluation scripts
-│   ├── single_run.py              # Single app generation
-│   ├── bulk_run.py                # Batch app generation
-│   ├── analyze_trajectories.py    # Get LLM recommendations based on previous runs
-│   ├── evaluate_all.py            # Batch evaluation
-│   ├── evaluate_app.py            # Single app evaluation
+│   ├── generation/                # App generation
+│   │   ├── prompts/               # Prompt collections
+│   │   ├── codegen.py             # Claude Agent SDK backend
+│   │   ├── codegen_multi.py       # LiteLLM backend
+│   │   ├── single_run.py          # Single app generation
+│   │   ├── bulk_run.py            # Batch app generation
+│   │   └── screenshot.py          # Batch screenshotting
+│   ├── evaluation/                # App evaluation
+│   │   ├── evaluate_all.py        # Batch evaluation
+│   │   ├── evaluate_app.py        # Single app evaluation (legacy)
+│   │   ├── evaluate_app_dagger.py # Dagger-based evaluation
+│   │   ├── eval_checks.py         # Check functions
+│   │   └── eval_metrics.py        # Metric definitions
+│   ├── utils/                     # Shared utilities
+│   ├── analyze_trajectories.py    # Get LLM recommendations
 │   ├── archive_evaluation.sh      # Create evaluation archive
 │   └── cleanup_evaluation.sh      # Clean generated apps
 ├── EVALUATION_REPORT.md           # Latest results (gitignored)
@@ -169,14 +179,14 @@ klaudbiusz/
 ### Development Workflow

 1. Write natural language prompt
-2. Generate: `uv run cli/single_run.py "your prompt"` or `uv run cli/bulk_run.py`
-3. Evaluate: `uv run cli/evaluate_all.py -j 0` (parallel, auto-detect CPUs)
+2. Generate: `uv run cli/generation/single_run.py "your prompt"` or `uv run cli/generation/bulk_run.py`
+3. Evaluate: `uv run cli/evaluation/evaluate_all.py -j 0` (parallel, auto-detect CPUs)
 4. Review: `cat EVALUATION_REPORT.md`
 5. Deploy apps that pass checks

 ### AI Assisted Edda Improvement Workflow

-1. Generate many apps with `uv run cli/bulk_run.py`
+1. Generate many apps with `uv run cli/generation/bulk_run.py`
 2. Analyze the trajectories with `uv run cli/analyze_trajectories.py`
 3. Based on the report, improve Edda tools and scaffolding
 4. Rerun the evaluation to measure impact

klaudbiusz/cli/analyze_trajectories.py

Lines changed: 4 additions & 1 deletion
@@ -20,7 +20,7 @@
     query,
 )
 from dotenv import load_dotenv
-from shared import build_mcp_command, validate_mcp_manifest
+from cli.utils.shared import build_mcp_command, validate_mcp_manifest

 logger = logging.getLogger(__name__)

@@ -153,6 +153,9 @@ def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_js
         stderr=subprocess.PIPE,
     )

+    if proc.stdin is None or proc.stdout is None:
+        raise RuntimeError("Failed to create subprocess pipes")
+
     init_request = (
         json.dumps(
             {
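For context on the added guard: `subprocess.Popen` only populates `proc.stdin`/`proc.stdout` when those streams are opened with `subprocess.PIPE`, so their static type is `IO[bytes] | None` and pyright flags unguarded use. A standalone sketch of the same pattern, assuming a placeholder command rather than the real MCP binary invocation:

```python
import subprocess

# Placeholder command; the real code launches the edda_mcp binary and speaks JSON-RPC over the pipes.
proc = subprocess.Popen(
    ["cat"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# Narrow the Optional pipe handles before use; this also fails fast if the pipes are missing.
if proc.stdin is None or proc.stdout is None:
    raise RuntimeError("Failed to create subprocess pipes")

proc.stdin.write(b'{"jsonrpc": "2.0", "id": 1, "method": "initialize"}\n')
proc.stdin.flush()
proc.stdin.close()
print(proc.stdout.readline().decode())
proc.wait(timeout=5)
```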
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+"""App evaluation tools."""

File renamed without changes.
File renamed without changes.

klaudbiusz/cli/evaluate_all.py renamed to klaudbiusz/cli/evaluation/evaluate_all.py

Lines changed: 23 additions & 30 deletions
@@ -20,35 +20,27 @@
 import fnmatch
 import json
 import sys
-from datetime import datetime
-from dotenv import load_dotenv
-
-# Load environment variables from .env file
-load_dotenv()
 import time
 from collections import Counter, defaultdict
 from dataclasses import asdict
+from datetime import datetime
 from pathlib import Path

-# Load environment variables
-try:
-    from dotenv import load_dotenv
-    env_paths = [
-        Path(__file__).parent.parent.parent / "edda" / ".env",
-        Path(__file__).parent.parent / ".env",
-    ]
-    for env_path in env_paths:
-        if env_path.exists():
-            load_dotenv(env_path)
-            break
-except ImportError:
-    pass
-
 import dagger
+from dotenv import load_dotenv

-# Import async Dagger-based evaluation
-from evaluate_app_dagger import evaluate_app_async
-from eval_metrics import eff_units
+from cli.evaluation.eval_metrics import eff_units
+from cli.evaluation.evaluate_app_dagger import evaluate_app_async
+
+# Load environment variables from .env file
+env_paths = [
+    Path(__file__).parent.parent.parent / "edda" / ".env",
+    Path(__file__).parent.parent / ".env",
+]
+for env_path in env_paths:
+    if env_path.exists():
+        load_dotenv(env_path)
+        break


 def get_git_commit_hash() -> str | None:
@@ -78,7 +70,7 @@ def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str,
     """
     try:
         # Import PROMPTS from bulk_run.py
-        from bulk_run import PROMPTS
+        from cli.generation.bulk_run import PROMPTS
     except ImportError:
         return {}, {}, {}

@@ -890,10 +882,11 @@ async def evaluate_with_semaphore(index, app_dir):
         estimated_sequential = eval_duration * args.parallel
         print(f" ⚡ Parallelization saved ~{estimated_sequential - eval_duration:.1f}s (speedup: {estimated_sequential/eval_duration:.1f}x)")

-    # Generate summary and report
-    print("\n📊 Generating summary report...")
-    summary = generate_summary_report(results)
-    markdown = generate_markdown_report(results, summary)
+    # Generate summary and report (filter out None results)
+    valid_results = [r for r in results if r is not None]
+    print(f"\n📊 Generating summary report for {len(valid_results)} apps...")
+    summary = generate_summary_report(valid_results)
+    markdown = generate_markdown_report(valid_results, summary)

     # Determine output paths - save to app-eval directory
     output_dir = script_dir.parent / "app-eval"
@@ -919,7 +912,7 @@
     # Save full results
     full_report = {
         "summary": summary,
-        "apps": results,
+        "apps": valid_results,
         "timestamp": timestamp,
         "evaluation_run_id": timestamp,
     }
@@ -932,14 +925,14 @@

     # Save CSV report
     csv_output = output_dir / "evaluation_report.csv"
-    csv_content = generate_csv_report(results)
+    csv_content = generate_csv_report(valid_results)
     csv_output.write_text(csv_content)
     print(f"✓ CSV report saved: {csv_output}")

     # Log to MLflow
     print("\n📊 Logging to MLflow...")
     try:
-        from mlflow_tracker import EvaluationTracker
+        from cli.utils.mlflow_tracker import EvaluationTracker

         # Determine experiment name based on --staging flag
         experiment_name = "/Shared/edda-staging-evaluations" if args.staging else None
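The new `valid_results` filter matters because the semaphore-bounded evaluation tasks can yield `None` for apps that failed to evaluate, and feeding `None` into the summary, markdown, and CSV generators would break aggregation. A rough sketch of the overall shape, with `evaluate_one` standing in for the project's `evaluate_with_semaphore` coroutine (all names here are illustrative except the filtering step itself):

```python
import asyncio

async def evaluate_one(sem: asyncio.Semaphore, app_dir: str) -> dict | None:
    """Illustrative stand-in for evaluate_with_semaphore: None means the app failed to evaluate."""
    async with sem:
        try:
            # The real code runs the Dagger-based evaluation here.
            return {"app": app_dir, "passed": True}
        except Exception:
            return None

async def main(app_dirs: list[str], parallel: int = 4) -> list[dict]:
    sem = asyncio.Semaphore(parallel)
    results = await asyncio.gather(*(evaluate_one(sem, d) for d in app_dirs))
    # Same step as in the diff: drop failed (None) evaluations before reporting.
    return [r for r in results if r is not None]

if __name__ == "__main__":
    print(asyncio.run(main(["app1", "app2"])))
```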

klaudbiusz/cli/evaluate_app.py renamed to klaudbiusz/cli/evaluation/evaluate_app.py

Lines changed: 7 additions & 7 deletions
@@ -24,11 +24,15 @@
 from pathlib import Path
 from typing import Any

+from dotenv import load_dotenv
+
+from cli.evaluation.eval_checks import check_databricks_connectivity as _check_db_connectivity, extract_sql_queries
+from cli.evaluation.eval_metrics import calculate_appeval_100, eff_units
+from cli.utils.template_detection import detect_template
+
 # Add the cli directory to Python path for imports
 sys.path.insert(0, str(Path(__file__).parent))

-from dotenv import load_dotenv
-
 # Load environment variables from .env file - try multiple locations
 env_paths = [
     Path(__file__).parent.parent.parent / "edda" / ".env",
@@ -45,10 +49,6 @@
 except ImportError:
     anthropic = None

-from eval_metrics import calculate_appeval_100, eff_units
-from eval_checks import check_databricks_connectivity as _check_db_connectivity, extract_sql_queries
-from template_detection import detect_template
-

 def get_backend_dir(app_dir: Path, template: str) -> Path:
     """Get backend directory based on template type."""
@@ -368,7 +368,7 @@ def _stop_app(app_dir: Path, template: str = "unknown", port: int = 8000) -> boo
             timeout=5,
         )
         time.sleep(1)
-    except:
+    except Exception:
         pass
     return False

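On the `except:` → `except Exception:` change in `_stop_app`: ruff flags bare `except` (E722) because it also swallows `KeyboardInterrupt` and `SystemExit`, so a best-effort cleanup helper written that way can make a long evaluation run impossible to interrupt. A generic illustration of the difference, not the project's actual helper:

```python
import subprocess

def stop_quietly(cmd: list[str]) -> bool:
    """Best-effort process stop; returns False on any ordinary failure."""
    try:
        subprocess.run(cmd, timeout=5, check=True)
        return True
    except Exception:
        # A bare `except:` here would also catch KeyboardInterrupt (Ctrl-C) and
        # SystemExit, hiding the user's attempt to abort; Exception lets those propagate.
        return False
```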
klaudbiusz/cli/evaluate_app_dagger.py renamed to klaudbiusz/cli/evaluation/evaluate_app_dagger.py

Lines changed: 23 additions & 26 deletions
@@ -12,6 +12,29 @@
 from dataclasses import asdict
 from pathlib import Path

+import dagger
+from dotenv import load_dotenv
+
+from cli.evaluation.evaluate_app import (
+    EvalResult,
+    FullMetrics,
+    check_data_validity_llm,
+    check_databricks_connectivity,
+    check_deployability,
+    check_local_runability,
+    check_ui_functional_vlm,
+    load_prompts_from_bulk_results,
+)
+from cli.utils.template_detection import detect_template
+from cli.utils.ts_workspace import (
+    build_app,
+    check_runtime,
+    check_types,
+    create_ts_workspace,
+    install_dependencies,
+    run_tests,
+)
+
 # Add the cli directory to Python path for imports
 sys.path.insert(0, str(Path(__file__).parent))

@@ -20,8 +43,6 @@
 if str(agent_path) not in sys.path:
     sys.path.insert(0, str(agent_path))

-from dotenv import load_dotenv
-
 # Load environment variables
 env_paths = [
     Path(__file__).parent.parent.parent / "edda" / ".env",
@@ -33,30 +54,6 @@
         load_dotenv(env_path, override=True)
         break

-import dagger
-
-from ts_workspace import (
-    create_ts_workspace,
-    install_dependencies,
-    build_app,
-    check_runtime,
-    run_tests,
-    check_types,
-)
-
-# Import original helper functions and classes
-from evaluate_app import (
-    FullMetrics,
-    EvalResult,
-    check_databricks_connectivity,
-    check_data_validity_llm,
-    check_ui_functional_vlm,
-    check_local_runability,
-    check_deployability,
-    load_prompts_from_bulk_results,
-)
-from template_detection import detect_template
-

 async def evaluate_app_async(
     client: dagger.Client,

klaudbiusz/cli/generate_eval_viewer.py

Lines changed: 0 additions & 3 deletions
@@ -13,9 +13,6 @@ def generate_html_viewer(eval_json_path: Path, output_path: Path):
     with open(eval_json_path) as f:
         data = json.load(f)

-    summary = data.get("summary", {})
-    apps = data.get("apps", [])
-
     # Embed the JSON data directly in the HTML
     json_data = json.dumps(data, indent=2)
