
Commit 5939250

Reorganize klaudbiusz CLI into logical subdirectories
- Created generation/, evaluation/, utils/ subdirectories
- Moved 23 Python files into organized structure
- Updated all imports to use new paths
- Fixed code quality issues (ruff, pyright)
- Updated README.md with new structure
- Added build artifacts to .gitignore
1 parent 8f204ec commit 5939250

27 files changed: +140 −130 lines
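The file diffs below all follow one import-migration pattern: flat, same-directory imports become absolute imports rooted at the `cli` package. A minimal before/after sketch, using module names taken from this commit's diffs (invocation of the scripts is unchanged; see the README diff):

```python
# Before: sibling modules imported by bare name, which only resolves when the
# script's own directory happens to be on sys.path.
# from shared import build_mcp_command, validate_mcp_manifest
# from eval_metrics import eff_units

# After: absolute imports rooted at the cli package, mirroring the new
# generation/, evaluation/, utils/ layout.
from cli.utils.shared import build_mcp_command, validate_mcp_manifest
from cli.evaluation.eval_metrics import eff_units
from cli.generation.bulk_run import PROMPTS
```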

klaudbiusz/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -18,3 +18,7 @@ __pycache__/
 *$py.class
 *.so
 .Python
+
+# Build artifacts
+.DS_Store
+.ruff_cache/

klaudbiusz/README.md

Lines changed: 38 additions & 28 deletions
@@ -30,29 +30,29 @@ cli/archive_evaluation.sh
 cli/cleanup_evaluation.sh

 # Generate a single app (Claude backend, default)
-uv run cli/single_run.py "Create a customer churn analysis dashboard"
+uv run cli/generation/single_run.py "Create a customer churn analysis dashboard"

 # Use LiteLLM backend with specific model
-uv run cli/single_run.py "Create a customer churn analysis dashboard" \
+uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \
   --backend=litellm --model=openrouter/minimax/minimax-m2

 # Batch generate from prompts (databricks set by default)
-uv run cli/bulk_run.py
+uv run cli/generation/bulk_run.py

 # Batch generate with test prompts
-uv run cli/bulk_run.py --prompts=test
+uv run cli/generation/bulk_run.py --prompts=test

 # Batch generate with LiteLLM backend
-uv run cli/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro
+uv run cli/generation/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro

 # Custom output directory
-uv run cli/bulk_run.py --output-dir=/path/to/custom/folder
+uv run cli/generation/bulk_run.py --output-dir=/path/to/custom/folder

 # Custom MCP binary (for testing modified edda_mcp)
-uv run cli/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp
+uv run cli/generation/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp

 # Combined example
-uv run cli/bulk_run.py \
+uv run cli/generation/bulk_run.py \
   --backend=litellm \
   --model=gemini/gemini-2.5-pro \
   --output-dir=./my-apps \
@@ -65,28 +65,28 @@ uv run cli/bulk_run.py \
 cd klaudbiusz

 # Evaluate all apps
-uv run cli/evaluate_all.py
+uv run cli/evaluation/evaluate_all.py

 # Parallel evaluation (faster for large batches)
-uv run cli/evaluate_all.py -j 4          # Run 4 evaluations in parallel
-uv run cli/evaluate_all.py -j 0          # Auto-detect CPU count
-uv run cli/evaluate_all.py --parallel 8  # Long form
+uv run cli/evaluation/evaluate_all.py -j 4          # Run 4 evaluations in parallel
+uv run cli/evaluation/evaluate_all.py -j 0          # Auto-detect CPU count
+uv run cli/evaluation/evaluate_all.py --parallel 8  # Long form

 # Partial evaluation (filter apps)
-uv run cli/evaluate_all.py --limit 5              # First 5 apps
-uv run cli/evaluate_all.py --apps app1 app2       # Specific apps
-uv run cli/evaluate_all.py --pattern "customer*"  # Pattern matching
-uv run cli/evaluate_all.py --skip 10 --limit 5    # Skip first 10, evaluate next 5
-uv run cli/evaluate_all.py --start-from app5      # Start from specific app
+uv run cli/evaluation/evaluate_all.py --limit 5              # First 5 apps
+uv run cli/evaluation/evaluate_all.py --apps app1 app2       # Specific apps
+uv run cli/evaluation/evaluate_all.py --pattern "customer*"  # Pattern matching
+uv run cli/evaluation/evaluate_all.py --skip 10 --limit 5    # Skip first 10, evaluate next 5
+uv run cli/evaluation/evaluate_all.py --start-from app5      # Start from specific app

 # Custom directory
-uv run cli/evaluate_all.py --dir /path/to/apps    # Evaluate apps in custom directory
+uv run cli/evaluation/evaluate_all.py --dir /path/to/apps    # Evaluate apps in custom directory

 # Staging environment (for testing)
-uv run cli/evaluate_all.py --staging              # Log to staging MLflow experiment
+uv run cli/evaluation/evaluate_all.py --staging   # Log to staging MLflow experiment

 # Evaluate single app
-uv run cli/evaluate_app.py ../app/customer-churn-analysis
+uv run cli/evaluation/evaluate_app.py ../app/customer-churn-analysis
 ```

 **Results are automatically logged to MLflow:** Navigate to `ML → Experiments → /Shared/klaudbiusz-evaluations` in Databricks UI / Googfooding.
@@ -151,11 +151,21 @@ klaudbiusz/
 │   └── DORA_METRICS.md            # DORA & agentic DevX
 ├── app/                           # Generated applications (gitignored)
 ├── cli/                           # Generation & evaluation scripts
-│   ├── single_run.py              # Single app generation
-│   ├── bulk_run.py                # Batch app generation
-│   ├── analyze_trajectories.py    # Get LLM recommendations based on previous runs
-│   ├── evaluate_all.py            # Batch evaluation
-│   ├── evaluate_app.py            # Single app evaluation
+│   ├── generation/                # App generation
+│   │   ├── prompts/               # Prompt collections
+│   │   ├── codegen.py             # Claude Agent SDK backend
+│   │   ├── codegen_multi.py       # LiteLLM backend
+│   │   ├── single_run.py          # Single app generation
+│   │   ├── bulk_run.py            # Batch app generation
+│   │   └── screenshot.py          # Batch screenshotting
+│   ├── evaluation/                # App evaluation
+│   │   ├── evaluate_all.py        # Batch evaluation
+│   │   ├── evaluate_app.py        # Single app evaluation (legacy)
+│   │   ├── evaluate_app_dagger.py # Dagger-based evaluation
+│   │   ├── eval_checks.py         # Check functions
+│   │   └── eval_metrics.py        # Metric definitions
+│   ├── utils/                     # Shared utilities
+│   ├── analyze_trajectories.py    # Get LLM recommendations
 │   ├── archive_evaluation.sh      # Create evaluation archive
 │   └── cleanup_evaluation.sh      # Clean generated apps
 ├── EVALUATION_REPORT.md           # Latest results (gitignored)
@@ -169,14 +179,14 @@ klaudbiusz/
 ### Development Workflow

 1. Write natural language prompt
-2. Generate: `uv run cli/single_run.py "your prompt"` or `uv run cli/bulk_run.py`
-3. Evaluate: `uv run cli/evaluate_all.py -j 0` (parallel, auto-detect CPUs)
+2. Generate: `uv run cli/generation/single_run.py "your prompt"` or `uv run cli/generation/bulk_run.py`
+3. Evaluate: `uv run cli/evaluation/evaluate_all.py -j 0` (parallel, auto-detect CPUs)
 4. Review: `cat EVALUATION_REPORT.md`
 5. Deploy apps that pass checks

 ### AI Assisted Edda Improvement Workflow

-1. Generate many apps with `uv run cli/bulk_run.py`
+1. Generate many apps with `uv run cli/generation/bulk_run.py`
 2. Analyze the trajectories with `uv run cli/analyze_trajectories.py`
 3. Based on the report, improve Edda tools and scaffolding
 4. Rerun the evaluation to measure impact

klaudbiusz/cli/analyze_trajectories.py

Lines changed: 4 additions & 1 deletion
@@ -20,7 +20,7 @@
     query,
 )
 from dotenv import load_dotenv
-from shared import build_mcp_command, validate_mcp_manifest
+from cli.utils.shared import build_mcp_command, validate_mcp_manifest

 logger = logging.getLogger(__name__)

@@ -153,6 +153,9 @@ def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_js
         stderr=subprocess.PIPE,
     )

+    if proc.stdin is None or proc.stdout is None:
+        raise RuntimeError("Failed to create subprocess pipes")
+
     init_request = (
         json.dumps(
             {
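For context on the added guard: `subprocess.Popen` only populates `proc.stdin`/`proc.stdout` when those streams are opened with `subprocess.PIPE`, so their static type is `IO[bytes] | None` and pyright flags unguarded use. A standalone sketch of the same pattern, assuming a placeholder command rather than the real MCP binary invocation:

```python
import subprocess

# Placeholder command; the real code launches the edda_mcp binary and speaks JSON-RPC over the pipes.
proc = subprocess.Popen(
    ["cat"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# Narrow the Optional pipe handles before use; this also fails fast if the pipes are missing.
if proc.stdin is None or proc.stdout is None:
    raise RuntimeError("Failed to create subprocess pipes")

proc.stdin.write(b'{"jsonrpc": "2.0", "id": 1, "method": "initialize"}\n')
proc.stdin.flush()
proc.stdin.close()
print(proc.stdout.readline().decode())
proc.wait(timeout=5)
```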
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+"""App evaluation tools."""

File renamed without changes.
File renamed without changes.

klaudbiusz/cli/evaluate_all.py renamed to klaudbiusz/cli/evaluation/evaluate_all.py

Lines changed: 23 additions & 30 deletions
@@ -20,35 +20,27 @@
 import fnmatch
 import json
 import sys
-from datetime import datetime
-from dotenv import load_dotenv
-
-# Load environment variables from .env file
-load_dotenv()
 import time
 from collections import Counter, defaultdict
 from dataclasses import asdict
+from datetime import datetime
 from pathlib import Path

-# Load environment variables
-try:
-    from dotenv import load_dotenv
-    env_paths = [
-        Path(__file__).parent.parent.parent / "edda" / ".env",
-        Path(__file__).parent.parent / ".env",
-    ]
-    for env_path in env_paths:
-        if env_path.exists():
-            load_dotenv(env_path)
-            break
-except ImportError:
-    pass
-
 import dagger
+from dotenv import load_dotenv

-# Import async Dagger-based evaluation
-from evaluate_app_dagger import evaluate_app_async
-from eval_metrics import eff_units
+from cli.evaluation.eval_metrics import eff_units
+from cli.evaluation.evaluate_app_dagger import evaluate_app_async
+
+# Load environment variables from .env file
+env_paths = [
+    Path(__file__).parent.parent.parent / "edda" / ".env",
+    Path(__file__).parent.parent / ".env",
+]
+for env_path in env_paths:
+    if env_path.exists():
+        load_dotenv(env_path)
+        break


 def get_git_commit_hash() -> str | None:
@@ -78,7 +70,7 @@ def load_prompts_and_metrics_from_bulk_run() -> tuple[dict[str, str], dict[str,
     """
     try:
         # Import PROMPTS from bulk_run.py
-        from bulk_run import PROMPTS
+        from cli.generation.bulk_run import PROMPTS
     except ImportError:
         return {}, {}, {}

@@ -890,10 +882,11 @@ async def evaluate_with_semaphore(index, app_dir):
         estimated_sequential = eval_duration * args.parallel
         print(f" ⚡ Parallelization saved ~{estimated_sequential - eval_duration:.1f}s (speedup: {estimated_sequential/eval_duration:.1f}x)")

-    # Generate summary and report
-    print("\n📊 Generating summary report...")
-    summary = generate_summary_report(results)
-    markdown = generate_markdown_report(results, summary)
+    # Generate summary and report (filter out None results)
+    valid_results = [r for r in results if r is not None]
+    print(f"\n📊 Generating summary report for {len(valid_results)} apps...")
+    summary = generate_summary_report(valid_results)
+    markdown = generate_markdown_report(valid_results, summary)

     # Determine output paths - save to app-eval directory
     output_dir = script_dir.parent / "app-eval"
@@ -919,7 +912,7 @@
     # Save full results
     full_report = {
         "summary": summary,
-        "apps": results,
+        "apps": valid_results,
         "timestamp": timestamp,
         "evaluation_run_id": timestamp,
     }
@@ -932,14 +925,14 @@

     # Save CSV report
     csv_output = output_dir / "evaluation_report.csv"
-    csv_content = generate_csv_report(results)
+    csv_content = generate_csv_report(valid_results)
     csv_output.write_text(csv_content)
     print(f"✓ CSV report saved: {csv_output}")

     # Log to MLflow
     print("\n📊 Logging to MLflow...")
     try:
-        from mlflow_tracker import EvaluationTracker
+        from cli.utils.mlflow_tracker import EvaluationTracker

         # Determine experiment name based on --staging flag
         experiment_name = "/Shared/edda-staging-evaluations" if args.staging else None
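The new `valid_results` filter matters because the semaphore-bounded evaluation tasks can yield `None` for apps that failed to evaluate, and feeding `None` into the summary, markdown, and CSV generators would break aggregation. A rough sketch of the overall shape, with `evaluate_one` standing in for the project's `evaluate_with_semaphore` coroutine (all names here are illustrative except the filtering step itself):

```python
import asyncio

async def evaluate_one(sem: asyncio.Semaphore, app_dir: str) -> dict | None:
    """Illustrative stand-in for evaluate_with_semaphore: None means the app failed to evaluate."""
    async with sem:
        try:
            # The real code runs the Dagger-based evaluation here.
            return {"app": app_dir, "passed": True}
        except Exception:
            return None

async def main(app_dirs: list[str], parallel: int = 4) -> list[dict]:
    sem = asyncio.Semaphore(parallel)
    results = await asyncio.gather(*(evaluate_one(sem, d) for d in app_dirs))
    # Same step as in the diff: drop failed (None) evaluations before reporting.
    return [r for r in results if r is not None]

if __name__ == "__main__":
    print(asyncio.run(main(["app1", "app2"])))
```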

klaudbiusz/cli/evaluate_app.py renamed to klaudbiusz/cli/evaluation/evaluate_app.py

Lines changed: 7 additions & 7 deletions
@@ -24,11 +24,15 @@
 from pathlib import Path
 from typing import Any

+from dotenv import load_dotenv
+
+from cli.evaluation.eval_checks import check_databricks_connectivity as _check_db_connectivity, extract_sql_queries
+from cli.evaluation.eval_metrics import calculate_appeval_100, eff_units
+from cli.utils.template_detection import detect_template
+
 # Add the cli directory to Python path for imports
 sys.path.insert(0, str(Path(__file__).parent))

-from dotenv import load_dotenv
-
 # Load environment variables from .env file - try multiple locations
 env_paths = [
     Path(__file__).parent.parent.parent / "edda" / ".env",
@@ -45,10 +49,6 @@
 except ImportError:
     anthropic = None

-from eval_metrics import calculate_appeval_100, eff_units
-from eval_checks import check_databricks_connectivity as _check_db_connectivity, extract_sql_queries
-from template_detection import detect_template
-

 def get_backend_dir(app_dir: Path, template: str) -> Path:
     """Get backend directory based on template type."""
@@ -368,7 +368,7 @@ def _stop_app(app_dir: Path, template: str = "unknown", port: int = 8000) -> boo
             timeout=5,
         )
         time.sleep(1)
-    except:
+    except Exception:
         pass
     return False

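On the `except:` → `except Exception:` change in `_stop_app`: ruff flags bare `except` (E722) because it also swallows `KeyboardInterrupt` and `SystemExit`, so a best-effort cleanup helper written that way can make a long evaluation run impossible to interrupt. A generic illustration of the difference, not the project's actual helper:

```python
import subprocess

def stop_quietly(cmd: list[str]) -> bool:
    """Best-effort process stop; returns False on any ordinary failure."""
    try:
        subprocess.run(cmd, timeout=5, check=True)
        return True
    except Exception:
        # A bare `except:` here would also catch KeyboardInterrupt (Ctrl-C) and
        # SystemExit, hiding the user's attempt to abort; Exception lets those propagate.
        return False
```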
klaudbiusz/cli/evaluate_app_dagger.py renamed to klaudbiusz/cli/evaluation/evaluate_app_dagger.py

Lines changed: 23 additions & 26 deletions
@@ -12,6 +12,29 @@
 from dataclasses import asdict
 from pathlib import Path

+import dagger
+from dotenv import load_dotenv
+
+from cli.evaluation.evaluate_app import (
+    EvalResult,
+    FullMetrics,
+    check_data_validity_llm,
+    check_databricks_connectivity,
+    check_deployability,
+    check_local_runability,
+    check_ui_functional_vlm,
+    load_prompts_from_bulk_results,
+)
+from cli.utils.template_detection import detect_template
+from cli.utils.ts_workspace import (
+    build_app,
+    check_runtime,
+    check_types,
+    create_ts_workspace,
+    install_dependencies,
+    run_tests,
+)
+
 # Add the cli directory to Python path for imports
 sys.path.insert(0, str(Path(__file__).parent))

@@ -20,8 +43,6 @@
 if str(agent_path) not in sys.path:
     sys.path.insert(0, str(agent_path))

-from dotenv import load_dotenv
-
 # Load environment variables
 env_paths = [
     Path(__file__).parent.parent.parent / "edda" / ".env",
@@ -33,30 +54,6 @@
         load_dotenv(env_path, override=True)
         break

-import dagger
-
-from ts_workspace import (
-    create_ts_workspace,
-    install_dependencies,
-    build_app,
-    check_runtime,
-    run_tests,
-    check_types,
-)
-
-# Import original helper functions and classes
-from evaluate_app import (
-    FullMetrics,
-    EvalResult,
-    check_databricks_connectivity,
-    check_data_validity_llm,
-    check_ui_functional_vlm,
-    check_local_runability,
-    check_deployability,
-    load_prompts_from_bulk_results,
-)
-from template_detection import detect_template
-

 async def evaluate_app_async(
     client: dagger.Client,

klaudbiusz/cli/generate_eval_viewer.py

Lines changed: 0 additions & 3 deletions
@@ -13,9 +13,6 @@ def generate_html_viewer(eval_json_path: Path, output_path: Path):
     with open(eval_json_path) as f:
         data = json.load(f)

-    summary = data.get("summary", {})
-    apps = data.get("apps", [])
-
     # Embed the JSON data directly in the HTML
     json_data = json.dumps(data, indent=2)
