@@ -30,29 +30,29 @@ cli/archive_evaluation.sh
 cli/cleanup_evaluation.sh

 # Generate a single app (Claude backend, default)
-uv run cli/single_run.py "Create a customer churn analysis dashboard"
+uv run cli/generation/single_run.py "Create a customer churn analysis dashboard"

 # Use LiteLLM backend with specific model
-uv run cli/single_run.py "Create a customer churn analysis dashboard" \
+uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \
   --backend=litellm --model=openrouter/minimax/minimax-m2

 # Batch generate from prompts (databricks set by default)
-uv run cli/bulk_run.py
+uv run cli/generation/bulk_run.py

 # Batch generate with test prompts
-uv run cli/bulk_run.py --prompts=test
+uv run cli/generation/bulk_run.py --prompts=test

 # Batch generate with LiteLLM backend
-uv run cli/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro
+uv run cli/generation/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro

 # Custom output directory
-uv run cli/bulk_run.py --output-dir=/path/to/custom/folder
+uv run cli/generation/bulk_run.py --output-dir=/path/to/custom/folder

 # Custom MCP binary (for testing modified edda_mcp)
-uv run cli/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp
+uv run cli/generation/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp

 # Combined example
-uv run cli/bulk_run.py \
+uv run cli/generation/bulk_run.py \
   --backend=litellm \
   --model=gemini/gemini-2.5-pro \
   --output-dir=./my-apps \
@@ -65,28 +65,28 @@ uv run cli/bulk_run.py \
 cd klaudbiusz

 # Evaluate all apps
-uv run cli/evaluate_all.py
+uv run cli/evaluation/evaluate_all.py

 # Parallel evaluation (faster for large batches)
-uv run cli/evaluate_all.py -j 4              # Run 4 evaluations in parallel
-uv run cli/evaluate_all.py -j 0              # Auto-detect CPU count
-uv run cli/evaluate_all.py --parallel 8      # Long form
+uv run cli/evaluation/evaluate_all.py -j 4          # Run 4 evaluations in parallel
+uv run cli/evaluation/evaluate_all.py -j 0          # Auto-detect CPU count
+uv run cli/evaluation/evaluate_all.py --parallel 8  # Long form

 # Partial evaluation (filter apps)
-uv run cli/evaluate_all.py --limit 5              # First 5 apps
-uv run cli/evaluate_all.py --apps app1 app2       # Specific apps
-uv run cli/evaluate_all.py --pattern "customer*"  # Pattern matching
-uv run cli/evaluate_all.py --skip 10 --limit 5    # Skip first 10, evaluate next 5
-uv run cli/evaluate_all.py --start-from app5      # Start from specific app
+uv run cli/evaluation/evaluate_all.py --limit 5              # First 5 apps
+uv run cli/evaluation/evaluate_all.py --apps app1 app2       # Specific apps
+uv run cli/evaluation/evaluate_all.py --pattern "customer*"  # Pattern matching
+uv run cli/evaluation/evaluate_all.py --skip 10 --limit 5    # Skip first 10, evaluate next 5
+uv run cli/evaluation/evaluate_all.py --start-from app5      # Start from specific app

 # Custom directory
-uv run cli/evaluate_all.py --dir /path/to/apps    # Evaluate apps in custom directory
+uv run cli/evaluation/evaluate_all.py --dir /path/to/apps    # Evaluate apps in custom directory

 # Staging environment (for testing)
-uv run cli/evaluate_all.py --staging              # Log to staging MLflow experiment
+uv run cli/evaluation/evaluate_all.py --staging              # Log to staging MLflow experiment

 # Evaluate single app
-uv run cli/evaluate_app.py ../app/customer-churn-analysis
+uv run cli/evaluation/evaluate_app.py ../app/customer-churn-analysis
 ```

 **Results are automatically logged to MLflow:** Navigate to `ML → Experiments → /Shared/klaudbiusz-evaluations` in the Databricks UI / dogfooding.
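
Reviewer note: a typical evaluation pass under the reorganized layout might look like the sketch below. It is illustrative only; every flag and path is taken from the commands documented above, and the assumption that `EVALUATION_REPORT.md` reflects the latest run comes from the Development Workflow section further down.

```bash
# Illustrative only: one way to chain the documented evaluation commands
cd klaudbiusz
uv run cli/evaluation/evaluate_all.py --limit 5 --staging   # smoke-test a few apps against the staging MLflow experiment
uv run cli/evaluation/evaluate_all.py -j 0                  # full parallel run, logged to /Shared/klaudbiusz-evaluations
cat EVALUATION_REPORT.md                                    # local summary of the latest run
```
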
@@ -151,11 +151,21 @@ klaudbiusz/
 │   └── DORA_METRICS.md            # DORA & agentic DevX
 ├── app/                           # Generated applications (gitignored)
 ├── cli/                           # Generation & evaluation scripts
-│   ├── single_run.py              # Single app generation
-│   ├── bulk_run.py                # Batch app generation
-│   ├── analyze_trajectories.py    # Get LLM recommendations based on previous runs
-│   ├── evaluate_all.py            # Batch evaluation
-│   ├── evaluate_app.py            # Single app evaluation
+│   ├── generation/                # App generation
+│   │   ├── prompts/               # Prompt collections
+│   │   ├── codegen.py             # Claude Agent SDK backend
+│   │   ├── codegen_multi.py       # LiteLLM backend
+│   │   ├── single_run.py          # Single app generation
+│   │   ├── bulk_run.py            # Batch app generation
+│   │   └── screenshot.py          # Batch screenshotting
+│   ├── evaluation/                # App evaluation
+│   │   ├── evaluate_all.py        # Batch evaluation
+│   │   ├── evaluate_app.py        # Single app evaluation (legacy)
+│   │   ├── evaluate_app_dagger.py # Dagger-based evaluation
+│   │   ├── eval_checks.py         # Check functions
+│   │   └── eval_metrics.py        # Metric definitions
+│   ├── utils/                     # Shared utilities
+│   ├── analyze_trajectories.py    # Get LLM recommendations
 │   ├── archive_evaluation.sh      # Create evaluation archive
 │   └── cleanup_evaluation.sh      # Clean generated apps
 ├── EVALUATION_REPORT.md           # Latest results (gitignored)
@@ -169,14 +179,14 @@ klaudbiusz/
 ### Development Workflow

 1. Write natural language prompt
-2. Generate: `uv run cli/single_run.py "your prompt"` or `uv run cli/bulk_run.py`
-3. Evaluate: `uv run cli/evaluate_all.py -j 0` (parallel, auto-detect CPUs)
+2. Generate: `uv run cli/generation/single_run.py "your prompt"` or `uv run cli/generation/bulk_run.py`
+3. Evaluate: `uv run cli/evaluation/evaluate_all.py -j 0` (parallel, auto-detect CPUs)
 4. Review: `cat EVALUATION_REPORT.md`
 5. Deploy apps that pass checks

 ### AI Assisted Edda Improvement Workflow

-1. Generate many apps with `uv run cli/bulk_run.py`
+1. Generate many apps with `uv run cli/generation/bulk_run.py`
 2. Analyze the trajectories with `uv run cli/analyze_trajectories.py`
 3. Based on the report, improve Edda tools and scaffolding
 4. Rerun the evaluation to measure impact
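
Reviewer note: the improvement loop above could run as a single shell session, sketched below under the new paths. This is not a prescribed pipeline; step 3 is manual and only indicated by a comment, and the custom `--mcp-binary` path is the placeholder used in the examples earlier in this file.

```bash
# Illustrative Edda improvement loop, assuming the default prompt set and output locations
cd klaudbiusz
uv run cli/generation/bulk_run.py                                          # 1. generate many apps
uv run cli/analyze_trajectories.py                                         # 2. get LLM recommendations from the runs
# 3. improve Edda tools/scaffolding based on the report, then regenerate against the modified binary
uv run cli/generation/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp
uv run cli/evaluation/evaluate_all.py -j 0                                 # 4. re-evaluate to measure impact
```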