25 changes: 25 additions & 0 deletions klaudbiusz/.dockerignore
@@ -0,0 +1,25 @@
# ignore generated apps and artifacts
app/
app-eval/
results/

# ignore python cache
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.ruff_cache/

# ignore git
.git/
.gitignore

# ignore env files (will be mounted)
.env

# ignore venv
.venv/

# ignore macOS metadata
.DS_Store
37 changes: 37 additions & 0 deletions klaudbiusz/Dockerfile
@@ -0,0 +1,37 @@
FROM python:3.12-slim

# install databricks cli and node.js (for claude agent sdk)
RUN apt-get update && \
apt-get install -y curl unzip git && \
curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh && \
curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \
apt-get install -y nodejs && \
npm install -g @anthropic-ai/claude-code && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# install uv for package management
RUN pip install uv

# install dependencies (cached until pyproject.toml changes)
COPY pyproject.toml README.md ./
RUN mkdir -p cli && touch cli/__init__.py && \
uv pip install --system -e . && \
rm -rf cli

# copy only generation-related source code (exclude evaluation to prevent reward hacking)
COPY cli/__init__.py ./cli/
COPY cli/trajectory.py ./cli/
COPY cli/generation/ ./cli/generation/
COPY cli/utils/ ./cli/utils/

# create non-root user (claude agent sdk requires non-root for security)
RUN useradd -m -s /bin/bash klaudbiusz && \
chown -R klaudbiusz:klaudbiusz /workspace

USER klaudbiusz

# default output directory for generated apps
ENV APP_OUTPUT_DIR=/workspace/app
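
For a quick sanity check of the image outside the Dagger flow, something like the following should work — the image tag, mount paths, and the interactive shell are illustrative assumptions, not part of the pipeline:

```bash
# Build from the klaudbiusz/ directory so .dockerignore applies (tag is arbitrary).
docker build -t klaudbiusz-gen ./klaudbiusz

# Drop into a shell as the non-root user; .env is mounted (see .dockerignore) and
# /workspace/app is bind-mounted so generated apps land on the host.
docker run --rm -it \
  -v "$PWD/klaudbiusz/.env:/workspace/.env:ro" \
  -v "$PWD/klaudbiusz/app:/workspace/app" \
  klaudbiusz-gen bash
```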
63 changes: 39 additions & 24 deletions klaudbiusz/README.md
@@ -21,42 +21,55 @@ cp .env.example .env
```

### Generate Applications

Generation runs inside Dagger containers for isolation and reproducibility.

**Prerequisites:**
- Docker running
- Linux build of edda_mcp binary (for Dagger containers)
- Databricks CLI OAuth configured (`~/.databrickscfg` + `~/.databricks/token-cache.json`)
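
One way to produce those two OAuth files, assuming U2M login with the Databricks CLI (the host URL is a placeholder):

```bash
# Interactive OAuth login; saves a profile to ~/.databrickscfg and caches the
# token under ~/.databricks/token-cache.json.
databricks auth login --host https://<your-workspace>.cloud.databricks.com
```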

```bash
cd klaudbiusz


# make sure app folder is empty
cli/archive_evaluation.sh
cli/cleanup_evaluation.sh

# Generate a single app (Claude backend, default)
uv run cli/generation/single_run.py "Create a customer churn analysis dashboard"

# Use LiteLLM backend with specific model
# Generate a single app via Dagger (requires Linux binary)
uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \
--backend=litellm --model=openrouter/minimax/minimax-m2
--mcp_binary=/path/to/linux/edda_mcp \
--mcp_args='["experimental", "apps-mcp"]'

# Batch generate from prompts (databricks set by default)
uv run cli/generation/bulk_run.py
# Batch generate from prompts
uv run cli/generation/bulk_run.py \
--mcp_binary=/path/to/linux/edda_mcp \
--mcp_args='["experimental", "apps-mcp"]'

# Batch generate with test prompts
uv run cli/generation/bulk_run.py --prompts=test
# Use LiteLLM backend with specific model
uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \
--backend=litellm --model=gemini/gemini-2.5-pro \
--mcp_binary=/path/to/linux/edda_mcp
```

# Batch generate with LiteLLM backend
uv run cli/generation/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro
**Building a Linux binary (for macOS users):**
```bash
cd /path/to/cli
GOOS=linux GOARCH=arm64 go build -o cli-linux .
# Then use --mcp_binary=/path/to/cli-linux
```

# Custom output directory
uv run cli/generation/bulk_run.py --output-dir=/path/to/custom/folder
### Local Debugging (without Dagger)

# Custom MCP binary (for testing modified edda_mcp)
uv run cli/generation/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp
For faster iteration during development, run directly on the host:

# Combined example
uv run cli/generation/bulk_run.py \
--backend=litellm \
--model=gemini/gemini-2.5-pro \
--output-dir=./my-apps \
--mcp-binary=../edda/target/release/edda_mcp
```bash
# Local run with macOS binary
uv run python cli/generation/container_runner.py "Create a dashboard" \
--app_name=debug-test \
--mcp_binary=/usr/local/bin/edda_mcp \
--mcp_args='["experimental", "apps-mcp"]' \
--output_dir=./app
```

### Evaluate Generated Apps
@@ -155,8 +168,10 @@ klaudbiusz/
│ │ ├── prompts/ # Prompt collections
│ │ ├── codegen.py # Claude Agent SDK backend
│ │ ├── codegen_multi.py # LiteLLM backend
│ │ ├── single_run.py # Single app generation
│ │ ├── bulk_run.py # Batch app generation
│ │ ├── dagger_run.py # Dagger container orchestration
│ │ ├── container_runner.py # Runner script (inside container or local)
│ │ ├── single_run.py # Single app generation (via Dagger)
│ │ ├── bulk_run.py # Batch app generation (via Dagger)
│ │ └── screenshot.py # Batch screenshotting
│ ├── evaluation/ # App evaluation
│ │ ├── evaluate_all.py # Batch evaluation
15 changes: 12 additions & 3 deletions klaudbiusz/cli/analyze_trajectories.py
@@ -136,7 +136,12 @@ async def analyze_single_trajectory(trajectory_md: str, app_name: str, model: st
return response.choices[0].message.content # type: ignore[attr-defined]


def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_json_path: str | None = None) -> str:
def get_mcp_tools_description(
mcp_binary: str | None,
project_root: Path,
mcp_json_path: str | None = None,
mcp_args: list[str] | None = None,
) -> str:
"""Extract MCP tool definitions by querying the MCP server.

Returns empty string if mcp_binary is not provided.
@@ -145,7 +150,7 @@ def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_js
return ""

mcp_manifest = validate_mcp_manifest(mcp_binary, project_root)
command, args = build_mcp_command(mcp_binary, mcp_manifest, mcp_json_path)
command, args = build_mcp_command(mcp_binary, mcp_manifest, mcp_json_path, mcp_args)

proc = subprocess.Popen(
[command, *args],
@@ -403,6 +408,7 @@ async def analyze_trajectories_async(
trajectories_pattern: str = "./app/*/trajectory.jsonl",
eval_report_path: str | None = None,
mcp_json_path: str | None = None,
mcp_args: list[str] | None = None,
):
"""Analyze trajectories using map-reduce approach with LLM, then agent-based analysis."""
litellm.drop_params = True
@@ -414,7 +420,7 @@
mcp_tools_doc = ""
if mcp_binary:
logger.info("📋 Extracting MCP tool definitions")
mcp_tools_doc = get_mcp_tools_description(mcp_binary, project_root, mcp_json_path)
mcp_tools_doc = get_mcp_tools_description(mcp_binary, project_root, mcp_json_path, mcp_args)

eval_report = ""
if eval_report_path:
@@ -460,6 +466,7 @@ def cli(
map_model: str = "anthropic/claude-haiku-4-5",
eval_report: str | None = None,
mcp_json: str | None = None,
mcp_args: list[str] | None = None,
):
"""Analyze agent trajectories to find friction points and patterns.

@@ -472,6 +479,7 @@
map_model: LiteLLM model identifier for individual trajectory analysis
eval_report: Path to evaluation report JSON (optional)
mcp_json: Optional path to JSON config file for edda_mcp
mcp_args: Optional list of args passed to the MCP server (overrides defaults)
"""
coloredlogs.install(
level=logging.INFO,
@@ -490,6 +498,7 @@
trajectories_pattern,
eval_report,
mcp_json,
mcp_args,
)
)

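A hedged sketch of how the new `mcp_args` option might be passed through `analyze_trajectories.py`; the `--flag=value` spelling mirrors the generation scripts above and the report path is illustrative, so adjust to however `cli()` is actually exposed:

```bash
# Assumed invocation; exact flag names depend on how cli() is wired up.
uv run cli/analyze_trajectories.py \
  --mcp_binary=/path/to/linux/edda_mcp \
  --mcp_args='["experimental", "apps-mcp"]' \
  --eval_report=results/eval_report.json   # optional evaluation report (path assumed)
```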
11 changes: 10 additions & 1 deletion klaudbiusz/cli/evaluation/evaluate_app_dagger.py
@@ -7,6 +7,7 @@

import asyncio
import json
import os
import sys
import time
from dataclasses import asdict
@@ -55,6 +56,11 @@
break


def _restore_terminal_cursor() -> None:
"""Restore terminal cursor after Dagger run (workaround for dagger/dagger#7160)."""
os.system("tput cnorm 2>/dev/null || true")


async def evaluate_app_async(
client: dagger.Client,
app_dir: Path,
@@ -390,7 +396,10 @@ async def main_async():

def main():
"""Sync wrapper for async main."""
asyncio.run(main_async())
try:
asyncio.run(main_async())
finally:
_restore_terminal_cursor()


if __name__ == "__main__":