Skip to content

Commit d98232e

Browse files
authored
Enabled parallel run of evals (#569)
Co-authored-by: Evgenii Kniazev <[email protected]>
1 parent 7b2887b commit d98232e

File tree

18 files changed

+1388
-60
lines changed

18 files changed

+1388
-60
lines changed

klaudbiusz/README.md

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,32 @@ cd klaudbiusz
5454
# Evaluate all apps
5555
uv run cli/evaluate_all.py
5656

57+
# Parallel evaluation (faster for large batches)
58+
uv run cli/evaluate_all.py -j 4 # Run 4 evaluations in parallel
59+
uv run cli/evaluate_all.py -j 0 # Auto-detect CPU count
60+
uv run cli/evaluate_all.py --parallel 8 # Long form
61+
5762
# Partial evaluation (filter apps)
5863
uv run cli/evaluate_all.py --limit 5 # First 5 apps
5964
uv run cli/evaluate_all.py --apps app1 app2 # Specific apps
6065
uv run cli/evaluate_all.py --pattern "customer*" # Pattern matching
6166
uv run cli/evaluate_all.py --skip 10 --limit 5 # Skip first 10, evaluate next 5
67+
uv run cli/evaluate_all.py --start-from app5 # Start from specific app
68+
69+
# Custom directory
70+
uv run cli/evaluate_all.py --dir /path/to/apps # Evaluate apps in custom directory
71+
72+
# Staging environment (for testing)
73+
uv run cli/evaluate_all.py --staging # Log to staging MLflow experiment
6274

6375
# Evaluate single app
6476
uv run cli/evaluate_app.py ../app/customer-churn-analysis
6577
```
6678

6779
**Results are automatically logged to MLflow:** Navigate to `ML → Experiments → /Shared/klaudbiusz-evaluations` in Databricks UI / Dogfooding.
6880

81+
**Performance:** Parallel evaluation with `-j` can provide 3-4x speedup for large batches (e.g., 20 apps in 5 min vs 15+ min sequential).
82+
6983
## Evaluation Framework
7084

7185
We use **9 objective metrics** to measure autonomous deployability:
@@ -143,7 +157,7 @@ klaudbiusz/
143157

144158
1. Write natural language prompt
145159
2. Generate: `uv run cli/single_run.py "your prompt"` or `uv run cli/bulk_run.py`
146-
3. Evaluate: `uv run cli/evaluate_all.py`
160+
3. Evaluate: `uv run cli/evaluate_all.py -j 0` (parallel, auto-detect CPUs)
147161
4. Review: `cat EVALUATION_REPORT.md`
148162
5. Deploy apps that pass checks
149163

@@ -169,9 +183,9 @@ shasum -a 256 -c klaudbiusz_evaluation_*.tar.gz.sha256
169183

170184
## Requirements
171185

172-
- Python 3.11+
186+
- Python 3.12+
173187
- uv (Python package manager)
174-
- Docker (for builds and runtime checks)
188+
- Docker (for Dagger containerized evaluations)
175189
- Node.js 18+ (for generated apps)
176190
- Databricks workspace with access token
177191

klaudbiusz/cli/dagger_utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""Simplified Dagger utilities for klaudbiusz evaluation."""
2+
3+
import dagger
4+
from typing import Self
5+
6+
7+
class ExecResult:
    """Captured outcome of a command executed in a Dagger container.

    Holds the three values read back from the container: the exit code
    and the text of the standard output and standard error streams.
    """

    exit_code: int  # value returned by ``ctr.exit_code()``
    stdout: str  # value returned by ``ctr.stdout()``
    stderr: str  # value returned by ``ctr.stderr()``

    def __init__(self, exit_code: int, stdout: str, stderr: str):
        self.exit_code, self.stdout, self.stderr = exit_code, stdout, stderr

    @classmethod
    async def from_ctr(cls, ctr: dagger.Container) -> Self:
        """Build an ExecResult by awaiting the container's result accessors.

        The accessors are awaited one after another, in the order
        exit code, stdout, stderr.
        """
        code = await ctr.exit_code()
        out = await ctr.stdout()
        err = await ctr.stderr()
        return cls(exit_code=code, stdout=out, stderr=err)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
set -e

# DBX SDK template: build step.
# The application is built from the package.json in the current directory.
# A missing package.json or a missing "build" entry only produces a warning;
# a failing `npm run build` aborts the script through `set -e`.
# All messages are written to stderr.

echo "Building application..." >&2

if [ ! -f "package.json" ]; then
  echo "⚠️ No package.json found" >&2
elif ! grep -q '"build"' package.json 2>/dev/null; then
  # The grep only looks for the literal text "build" anywhere in the file.
  echo "⚠️ No build script found in package.json" >&2
else
  echo "Building from root..." >&2
  npm run build
  echo "✅ Build successful" >&2
fi
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
set -e

# DBX SDK template: dependency installation.
# Expects a single package.json in the current directory and exits with
# status 1 when it is missing. All messages are written to stderr.

echo "Installing dependencies..." >&2

# Guard: without a manifest there is nothing to install.
if [ ! -f "package.json" ]; then
  echo "⚠️ No package.json found" >&2
  exit 1
fi

npm install
echo "✅ Dependencies installed" >&2

klaudbiusz/cli/eval/dbx-sdk/start.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ if [ -z "$DATABRICKS_HOST" ] || [ -z "$DATABRICKS_TOKEN" ]; then
3939
exit 1
4040
fi
4141

42+
# Set default port if not provided
43+
DATABRICKS_APP_PORT="${DATABRICKS_APP_PORT:-8000}"
44+
4245
# Verify package.json exists
4346
if [ ! -f "package.json" ]; then
4447
echo "❌ Error: No package.json found in root directory" >&2
@@ -61,13 +64,13 @@ fi
6164
# Health check with retries (3 attempts, 2s timeout each, 1s apart)
6265
for i in {1..3}; do
6366
# Try healthcheck endpoint first
64-
if curl -f -s --max-time 2 http://localhost:8000/healthcheck >/dev/null 2>&1; then
67+
if curl -f -s --max-time 2 http://localhost:${DATABRICKS_APP_PORT}/healthcheck >/dev/null 2>&1; then
6568
echo "✅ App ready (healthcheck)" >&2
6669
exit 0
6770
fi
6871

6972
# Fallback to root endpoint for npm apps
70-
if curl -f -s --max-time 2 http://localhost:8000/ >/dev/null 2>&1; then
73+
if curl -f -s --max-time 2 http://localhost:${DATABRICKS_APP_PORT}/ >/dev/null 2>&1; then
7174
echo "✅ App ready (root)" >&2
7275
exit 0
7376
fi
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
set -e

# Docker template: build step.
# Builds an image from the Dockerfile in the current directory and tags it
# "eval-<name>", where <name> is $DATABRICKS_APP_NAME or "app" when that
# variable is unset or empty. Exits with status 1 if there is no Dockerfile.
# All messages are written to stderr.

echo "Building Docker image..." >&2

[ -f "Dockerfile" ] || {
  echo "⚠️ No Dockerfile found" >&2
  exit 1
}

# Image tag derived from the app name (default: app).
image_tag="eval-${DATABRICKS_APP_NAME:-app}"

docker build -t "${image_tag}" .
echo "✅ Docker image built successfully" >&2
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
set -e

# Docker template: Install dependencies
# Handles three project layouts, checked in this order:
#   1. root package.json containing "install:all" (trpc style)
#      -> npm run install:all
#   2. root package.json without it (dbx-sdk style)
#      -> npm install at the root
#   3. no root package.json
#      -> npm install in server/, then in client/ (or frontend/ if there is
#         no client/), for each directory that has its own package.json
# Any failing npm command aborts the script with a non-zero status.
# All progress messages are written to stderr.

# Run `npm install` inside directory $1.
# A subshell is used instead of `cd dir && npm install && cd ..`: with
# `set -e`, a failure in a non-final member of an && list does not stop the
# script, so a failed install would be reported as success and the working
# directory would be left inside $1. The subshell's non-zero status does
# trigger `set -e`, and the caller's working directory is never changed.
npm_install_in() {
  (cd "$1" && npm install)
}

echo "Installing dependencies..." >&2

# Check if root package.json has install:all script (trpc style)
if [ -f "package.json" ] && grep -q '"install:all"' package.json 2>/dev/null; then
  echo "Running npm run install:all..." >&2
  npm run install:all
elif [ -f "package.json" ]; then
  # Root-level app (dbx-sdk style)
  echo "Installing root dependencies..." >&2
  npm install
else
  # Install server/client separately if they exist
  if [ -d "server" ] && [ -f "server/package.json" ]; then
    echo "Installing server dependencies..." >&2
    npm_install_in server
  fi

  if [ -d "client" ] && [ -f "client/package.json" ]; then
    echo "Installing client dependencies..." >&2
    npm_install_in client
  elif [ -d "frontend" ] && [ -f "frontend/package.json" ]; then
    echo "Installing frontend dependencies..." >&2
    npm_install_in frontend
  fi
fi

echo "✅ Dependencies installed" >&2

klaudbiusz/cli/eval/docker/start.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ ENV_VARS+=("-e" "DATABRICKS_APP_PORT=${DATABRICKS_APP_PORT}")
6666
ENV_VARS+=("-e" "FLASK_RUN_HOST=${FLASK_RUN_HOST}")
6767

6868
# Run the container
69-
docker run -d -p 8000:8000 \
69+
docker run -d -p ${DATABRICKS_APP_PORT}:8000 \
7070
--name "${CONTAINER_NAME}" \
7171
${ENV_FILE_ARGS} \
7272
"${ENV_VARS[@]}" \
@@ -84,7 +84,7 @@ fi
8484
# Health check with retries (3 attempts, 2s timeout each, 1s apart)
8585
# Docker apps should have proper /healthcheck endpoint
8686
for i in {1..3}; do
87-
if curl -f -s --max-time 2 http://localhost:8000/healthcheck >/dev/null 2>&1; then
87+
if curl -f -s --max-time 2 http://localhost:${DATABRICKS_APP_PORT}/healthcheck >/dev/null 2>&1; then
8888
echo "✅ App ready (healthcheck)" >&2
8989
exit 0
9090
fi

klaudbiusz/cli/eval/trpc/build.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
set -e

# tRPC template: Build the application
# Builds the React frontend. The first matching location wins:
#   1. client/package.json
#   2. frontend/package.json
#   3. package.json in the current directory
# A location without a "build" entry only produces a warning.
# A failing build aborts the script with a non-zero status.
# All messages are written to stderr.

# Run `npm run build` inside directory $1.
# A subshell is used instead of `cd dir && npm run build && cd ..`: with
# `set -e`, a failure in a non-final member of an && list does not stop the
# script, so a failed build would still be followed by the success message.
# The subshell's non-zero status does trigger `set -e`, and the caller's
# working directory is never changed.
npm_build_in() {
  (cd "$1" && npm run build)
}

echo "Building application..." >&2

# Build client if it exists
if [ -d "client" ] && [ -f "client/package.json" ]; then
  if grep -q '"build"' client/package.json 2>/dev/null; then
    echo "Building client..." >&2
    npm_build_in client
    echo "✅ Client built successfully" >&2
  else
    echo "⚠️ No build script found in client/package.json" >&2
  fi
elif [ -d "frontend" ] && [ -f "frontend/package.json" ]; then
  if grep -q '"build"' frontend/package.json 2>/dev/null; then
    echo "Building frontend..." >&2
    npm_build_in frontend
    echo "✅ Frontend built successfully" >&2
  else
    echo "⚠️ No build script found in frontend/package.json" >&2
  fi
else
  # Try root-level build
  if [ -f "package.json" ] && grep -q '"build"' package.json 2>/dev/null; then
    echo "Building from root..." >&2
    npm run build
    echo "✅ Build successful" >&2
  else
    echo "⚠️ No build script found" >&2
  fi
fi
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
set -e

# tRPC template: Install dependencies
# Handles dependency installation for the tRPC monorepo structure:
#   - if the root package.json contains "install:all", run that script;
#   - otherwise run npm install in server/, then in client/ (or frontend/
#     if there is no client/), for each directory that has a package.json.
# Any failing npm command aborts the script with a non-zero status.
# All progress messages are written to stderr.

# Run `npm install` inside directory $1.
# A subshell is used instead of `cd dir && npm install && cd ..`: with
# `set -e`, a failure in a non-final member of an && list does not stop the
# script, so a failed install would be reported as success and the working
# directory would be left inside $1. The subshell's non-zero status does
# trigger `set -e`, and the caller's working directory is never changed.
npm_install_in() {
  (cd "$1" && npm install)
}

echo "Installing dependencies..." >&2

# Check if root package.json has install:all script
if [ -f "package.json" ] && grep -q '"install:all"' package.json 2>/dev/null; then
  echo "Running npm run install:all..." >&2
  npm run install:all
else
  # Install server dependencies
  if [ -d "server" ] && [ -f "server/package.json" ]; then
    echo "Installing server dependencies..." >&2
    npm_install_in server
  fi

  # Install client dependencies (try both client/ and frontend/)
  if [ -d "client" ] && [ -f "client/package.json" ]; then
    echo "Installing client dependencies..." >&2
    npm_install_in client
  elif [ -d "frontend" ] && [ -f "frontend/package.json" ]; then
    echo "Installing frontend dependencies..." >&2
    npm_install_in frontend
  fi
fi

echo "✅ Dependencies installed" >&2

0 commit comments

Comments
 (0)