diff --git a/klaudbiusz/.dockerignore b/klaudbiusz/.dockerignore new file mode 100644 index 00000000..c656b6b1 --- /dev/null +++ b/klaudbiusz/.dockerignore @@ -0,0 +1,25 @@ +# ignore generated apps and artifacts +app/ +app-eval/ +results/ + +# ignore python cache +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.ruff_cache/ + +# ignore git +.git/ +.gitignore + +# ignore env files (will be mounted) +.env + +# ignore venv +.venv/ + +# ignore build artifacts +.DS_Store diff --git a/klaudbiusz/Dockerfile b/klaudbiusz/Dockerfile new file mode 100644 index 00000000..8a7c18b4 --- /dev/null +++ b/klaudbiusz/Dockerfile @@ -0,0 +1,37 @@ +FROM python:3.12-slim + +# install databricks cli and node.js (for claude agent sdk) +RUN apt-get update && \ + apt-get install -y curl unzip git && \ + curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh && \ + curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \ + apt-get install -y nodejs && \ + npm install -g @anthropic-ai/claude-code && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +# install uv for package management +RUN pip install uv + +# install dependencies (cached until pyproject.toml changes) +COPY pyproject.toml README.md ./ +RUN mkdir -p cli && touch cli/__init__.py && \ + uv pip install --system -e . && \ + rm -rf cli + +# copy only generation-related source code (exclude evaluation to prevent reward hacking) +COPY cli/__init__.py ./cli/ +COPY cli/trajectory.py ./cli/ +COPY cli/generation/ ./cli/generation/ +COPY cli/utils/ ./cli/utils/ + +# create non-root user (claude agent sdk requires non-root for security) +RUN useradd -m -s /bin/bash klaudbiusz && \ + chown -R klaudbiusz:klaudbiusz /workspace + +USER klaudbiusz + +# set working directory for app generation +ENV APP_OUTPUT_DIR=/workspace/app diff --git a/klaudbiusz/README.md b/klaudbiusz/README.md index 49080ece..c5293ffd 100644 --- a/klaudbiusz/README.md +++ b/klaudbiusz/README.md @@ -21,42 +21,55 @@ cp .env.example .env ``` ### Generate Applications + +Generation runs inside Dagger containers for isolation and reproducibility. 
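+The CLI entry points below are thin wrappers around `DaggerAppGenerator` in `cli/generation/dagger_run.py`; a minimal programmatic sketch of the same flow (the binary path and app name are placeholders, and the same prerequisites listed below apply):
+
+```python
+import asyncio
+from pathlib import Path
+
+from cli.generation.dagger_run import DaggerAppGenerator
+
+generator = DaggerAppGenerator(
+    mcp_binary=Path("/path/to/linux/edda_mcp"),  # placeholder: Linux build of edda_mcp
+    output_dir=Path("./app"),
+)
+# generate_single returns (app_dir or None, log_file, metrics or None)
+app_dir, log_file, metrics = asyncio.run(
+    generator.generate_single(
+        "Create a customer churn analysis dashboard",
+        app_name="churn-dashboard",  # placeholder app name
+        mcp_args=["experimental", "apps-mcp"],
+    )
+)
+print(app_dir, log_file, metrics)
+```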
+ +**Prerequisites:** +- Docker running +- Linux build of edda_mcp binary (for Dagger containers) +- Databricks CLI OAuth configured (`~/.databrickscfg` + `~/.databricks/token-cache.json`) + ```bash cd klaudbiusz - # make sure app folder is empty cli/archive_evaluation.sh cli/cleanup_evaluation.sh -# Generate a single app (Claude backend, default) -uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" - -# Use LiteLLM backend with specific model +# Generate a single app via Dagger (requires Linux binary) uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \ - --backend=litellm --model=openrouter/minimax/minimax-m2 + --mcp_binary=/path/to/linux/edda_mcp \ + --mcp_args='["experimental", "apps-mcp"]' -# Batch generate from prompts (databricks set by default) -uv run cli/generation/bulk_run.py +# Batch generate from prompts +uv run cli/generation/bulk_run.py \ + --mcp_binary=/path/to/linux/edda_mcp \ + --mcp_args='["experimental", "apps-mcp"]' -# Batch generate with test prompts -uv run cli/generation/bulk_run.py --prompts=test +# Use LiteLLM backend with specific model +uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \ + --backend=litellm --model=gemini/gemini-2.5-pro \ + --mcp_binary=/path/to/linux/edda_mcp +``` -# Batch generate with LiteLLM backend -uv run cli/generation/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro +**Building Linux binary (for macOS users):** +```bash +cd /path/to/cli +GOOS=linux GOARCH=arm64 go build -o cli-linux . +# Then use --mcp_binary=/path/to/cli-linux +``` -# Custom output directory -uv run cli/generation/bulk_run.py --output-dir=/path/to/custom/folder +### Local Debugging (without Dagger) -# Custom MCP binary (for testing modified edda_mcp) -uv run cli/generation/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp +For faster iteration during development, run directly on host: -# Combined example -uv run cli/generation/bulk_run.py \ - --backend=litellm \ - --model=gemini/gemini-2.5-pro \ - --output-dir=./my-apps \ - --mcp-binary=../edda/target/release/edda_mcp +```bash +# Local run with macOS binary +uv run python cli/generation/container_runner.py "Create a dashboard" \ + --app_name=debug-test \ + --mcp_binary=/usr/local/bin/edda_mcp \ + --mcp_args='["experimental", "apps-mcp"]' \ + --output_dir=./app ``` ### Evaluate Generated Apps @@ -155,8 +168,10 @@ klaudbiusz/ │ │ ├── prompts/ # Prompt collections │ │ ├── codegen.py # Claude Agent SDK backend │ │ ├── codegen_multi.py # LiteLLM backend -│ │ ├── single_run.py # Single app generation -│ │ ├── bulk_run.py # Batch app generation +│ │ ├── dagger_run.py # Dagger container orchestration +│ │ ├── container_runner.py # Runner script (inside container or local) +│ │ ├── single_run.py # Single app generation (via Dagger) +│ │ ├── bulk_run.py # Batch app generation (via Dagger) │ │ └── screenshot.py # Batch screenshotting │ ├── evaluation/ # App evaluation │ │ ├── evaluate_all.py # Batch evaluation diff --git a/klaudbiusz/cli/analyze_trajectories.py b/klaudbiusz/cli/analyze_trajectories.py index 1d00eea2..cabdd000 100644 --- a/klaudbiusz/cli/analyze_trajectories.py +++ b/klaudbiusz/cli/analyze_trajectories.py @@ -136,7 +136,12 @@ async def analyze_single_trajectory(trajectory_md: str, app_name: str, model: st return response.choices[0].message.content # type: ignore[attr-defined] -def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_json_path: str | None = None) -> str: +def 
get_mcp_tools_description( + mcp_binary: str | None, + project_root: Path, + mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, +) -> str: """Extract MCP tool definitions by querying the MCP server. Returns empty string if mcp_binary is not provided. @@ -145,7 +150,7 @@ def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_js return "" mcp_manifest = validate_mcp_manifest(mcp_binary, project_root) - command, args = build_mcp_command(mcp_binary, mcp_manifest, mcp_json_path) + command, args = build_mcp_command(mcp_binary, mcp_manifest, mcp_json_path, mcp_args) proc = subprocess.Popen( [command, *args], @@ -403,6 +408,7 @@ async def analyze_trajectories_async( trajectories_pattern: str = "./app/*/trajectory.jsonl", eval_report_path: str | None = None, mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, ): """Analyze trajectories using map-reduce approach with LLM, then agent-based analysis.""" litellm.drop_params = True @@ -414,7 +420,7 @@ async def analyze_trajectories_async( mcp_tools_doc = "" if mcp_binary: logger.info("📋 Extracting MCP tool definitions") - mcp_tools_doc = get_mcp_tools_description(mcp_binary, project_root, mcp_json_path) + mcp_tools_doc = get_mcp_tools_description(mcp_binary, project_root, mcp_json_path, mcp_args) eval_report = "" if eval_report_path: @@ -460,6 +466,7 @@ def cli( map_model: str = "anthropic/claude-haiku-4-5", eval_report: str | None = None, mcp_json: str | None = None, + mcp_args: list[str] | None = None, ): """Analyze agent trajectories to find friction points and patterns. @@ -472,6 +479,7 @@ def cli( map_model: LiteLLM model identifier for individual trajectory analysis eval_report: Path to evaluation report JSON (optional) mcp_json: Optional path to JSON config file for edda_mcp + mcp_args: Optional list of args passed to the MCP server (overrides defaults) """ coloredlogs.install( level=logging.INFO, @@ -490,6 +498,7 @@ def cli( trajectories_pattern, eval_report, mcp_json, + mcp_args, ) ) diff --git a/klaudbiusz/cli/evaluation/evaluate_app_dagger.py b/klaudbiusz/cli/evaluation/evaluate_app_dagger.py index 90396bc0..54f0f172 100644 --- a/klaudbiusz/cli/evaluation/evaluate_app_dagger.py +++ b/klaudbiusz/cli/evaluation/evaluate_app_dagger.py @@ -7,6 +7,7 @@ import asyncio import json +import os import sys import time from dataclasses import asdict @@ -55,6 +56,11 @@ break +def _restore_terminal_cursor() -> None: + """Restore terminal cursor after Dagger run (workaround for dagger/dagger#7160).""" + os.system("tput cnorm 2>/dev/null || true") + + async def evaluate_app_async( client: dagger.Client, app_dir: Path, @@ -390,7 +396,10 @@ async def main_async(): def main(): """Sync wrapper for async main.""" - asyncio.run(main_async()) + try: + asyncio.run(main_async()) + finally: + _restore_terminal_cursor() if __name__ == "__main__": diff --git a/klaudbiusz/cli/generation/bulk_run.py b/klaudbiusz/cli/generation/bulk_run.py index fa78e0a9..7046c1fb 100644 --- a/klaudbiusz/cli/generation/bulk_run.py +++ b/klaudbiusz/cli/generation/bulk_run.py @@ -1,199 +1,64 @@ -"""Bulk runner for generating multiple apps from hardcoded prompts.""" +"""Bulk app generation via Dagger with parallelism.""" +import asyncio import json import os -import signal -import sys from datetime import datetime from pathlib import Path -from typing import TypedDict +import fire from dotenv import load_dotenv -from joblib import Parallel, delayed +from tqdm import tqdm -from cli.generation.codegen import ClaudeAppBuilder -from 
cli.generation.codegen import GenerationMetrics as ClaudeGenerationMetrics -from cli.generation.codegen_multi import LiteLLMAppBuilder -from cli.generation.prompts.databricks import PROMPTS as DATABRICKS_PROMPTS -from cli.utils.litellm_multiprocess_fix import patch_litellm_for_multiprocessing +from cli.generation.dagger_run import DaggerAppGenerator -patch_litellm_for_multiprocessing() - -# Unified type for metrics from both backends -GenerationMetrics = ClaudeGenerationMetrics - -# Load environment variables from .env file load_dotenv() -# Re-export for eval compatibility -PROMPTS = DATABRICKS_PROMPTS - - -class RunResult(TypedDict): - prompt: str - success: bool - metrics: GenerationMetrics | None - error: str | None - app_dir: str | None - mcp_binary: str | None - backend: str - model: str | None - - -def run_single_generation( - app_name: str, - prompt: str, - backend: str, - model: str | None, - wipe_db: bool = False, - suppress_logs: bool = True, - mcp_binary: str | None = None, - mcp_json: str | None = None, - mcp_args: list[str] | None = None, - output_dir: str | None = None, -) -> RunResult: - # re-apply litellm patch in worker process (joblib uses spawn/fork) - if backend == "litellm": - patch_litellm_for_multiprocessing() - - def timeout_handler(signum, frame): - raise TimeoutError("Generation timed out after 1200 seconds") - - try: - # set 20 minute timeout for entire generation - signal.signal(signal.SIGALRM, timeout_handler) - signal.alarm(1200) - - match backend: - case "claude": - codegen = ClaudeAppBuilder( - app_name=app_name, - wipe_db=wipe_db, - suppress_logs=suppress_logs, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - output_dir=output_dir, - ) - metrics = codegen.run(prompt, wipe_db=wipe_db) - app_dir = metrics.get("app_dir") if metrics else None - case "litellm": - if not model: - raise ValueError("--model is required when using --backend=litellm") - builder = LiteLLMAppBuilder( - app_name=app_name, - model=model, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - suppress_logs=suppress_logs, - output_dir=output_dir, - ) - litellm_metrics = builder.run(prompt) - # convert LiteLLM metrics to dict format matching Claude SDK - metrics: GenerationMetrics = { - "cost_usd": litellm_metrics.cost_usd, - "input_tokens": litellm_metrics.input_tokens, - "output_tokens": litellm_metrics.output_tokens, - "turns": litellm_metrics.turns, - "app_dir": litellm_metrics.app_dir, - } - app_dir = litellm_metrics.app_dir - case _: - raise ValueError(f"Unknown backend: {backend}. Use 'claude' or 'litellm'") - - signal.alarm(0) # cancel timeout - return { - "prompt": prompt, - "success": True, - "metrics": metrics, - "error": None, - "app_dir": app_dir, - "mcp_binary": mcp_binary, - "backend": backend, - "model": model, - } - except TimeoutError as e: - signal.alarm(0) # cancel timeout - print(f"[TIMEOUT] {prompt[:80]}...", file=sys.stderr, flush=True) - return { - "prompt": prompt, - "success": False, - "metrics": None, - "error": str(e), - "app_dir": None, - "mcp_binary": mcp_binary, - "backend": backend, - "model": model, - } - except Exception as e: - signal.alarm(0) # cancel timeout - print(f"[ERROR] {prompt[:80]}... 
- {e}", file=sys.stderr, flush=True) - return { - "prompt": prompt, - "success": False, - "metrics": None, - "error": str(e), - "app_dir": None, - "mcp_binary": mcp_binary, - "backend": backend, - "model": model, - } +def _restore_terminal_cursor() -> None: + """Restore terminal cursor after Dagger run (workaround for dagger/dagger#7160).""" + os.system("tput cnorm 2>/dev/null || true") def main( prompts: str = "databricks", backend: str = "claude", model: str | None = None, - wipe_db: bool = False, - n_jobs: int = -1, mcp_binary: str | None = None, - mcp_json: str | None = None, mcp_args: list[str] | None = None, output_dir: str | None = None, + max_concurrency: int = 6, ) -> None: - """Bulk app generation from predefined prompt sets. + """Bulk app generation via Dagger with parallelism. Args: - prompts: Prompt set to use ("databricks", "databricks_v2", or "test", default: "databricks") - backend: Backend to use ("claude" or "litellm", default: "claude") - model: LLM model (required if backend=litellm, e.g., "openrouter/minimax/minimax-m2") - wipe_db: Whether to wipe database on start - n_jobs: Number of parallel jobs (-1 for all cores) - mcp_args: Optional list of args passed to the MCP server (overrides defaults) - mcp_binary: Optional path to pre-built edda-mcp binary (default: use cargo run) - mcp_json: Optional path to JSON config file for edda_mcp - output_dir: Custom output directory for generated apps (default: ./app) + prompts: Prompt set to use ("databricks", "databricks_v2", or "test") + backend: Backend to use ("claude" or "litellm") + model: LLM model (required if backend=litellm) + mcp_binary: Path to edda_mcp binary (required) + mcp_args: Optional list of args passed to the MCP server + output_dir: Custom output directory for generated apps + max_concurrency: Maximum parallel generations (default: 4) Usage: - # Claude backend (default) with databricks prompts (default) - python bulk_run.py - - # Claude backend with databricks_v2 prompts - python bulk_run.py --prompts=databricks_v2 + # Claude backend with databricks prompts + python bulk_run.py --mcp_binary=/path/to/edda_mcp - # Claude backend with test prompts - python bulk_run.py --prompts=test + # With custom concurrency + python bulk_run.py --mcp_binary=/path/to/edda_mcp --max_concurrency=8 # LiteLLM backend - python bulk_run.py --backend=litellm --model=openrouter/minimax/minimax-m2 - python bulk_run.py --prompts=test --backend=litellm --model=gemini/gemini-2.5-pro - - # Custom MCP config - python bulk_run.py --mcp_json=./config/databricks-cli.json - - # Custom output directory - python bulk_run.py --output-dir=/path/to/custom/folder + python bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro --mcp_binary=/path/to/edda_mcp + """ + if not mcp_binary: + raise ValueError("--mcp_binary is required") - # Custom MCP binary - python bulk_run.py --mcp-binary=/path/to/edda_mcp + if backend == "litellm" and not model: + raise ValueError("--model is required when using --backend=litellm") - # Optional: Run screenshots after generation - python screenshot.py ./app --concurrency=5 --wait-time=120000 - """ - # bulk run always suppresses logs - suppress_logs = True + # validate required environment variables + if not os.environ.get("DATABRICKS_HOST") or not os.environ.get("DATABRICKS_TOKEN"): + raise ValueError("DATABRICKS_HOST and DATABRICKS_TOKEN environment variables must be set") # load prompt set match prompts: @@ -206,67 +71,69 @@ def main( case _: raise ValueError(f"Unknown prompt set: {prompts}. 
Use 'databricks', 'databricks_v2', or 'test'") - # validate backend-specific requirements - if backend == "litellm" and not model: - raise ValueError("--model is required when using --backend=litellm") - - # validate required environment variables - if not os.environ.get("DATABRICKS_HOST") or not os.environ.get("DATABRICKS_TOKEN"): - raise ValueError("DATABRICKS_HOST and DATABRICKS_TOKEN environment variables must be set") - print(f"Starting bulk generation for {len(selected_prompts)} prompts...") print(f"Backend: {backend}") if backend == "litellm": print(f"Model: {model}") print(f"Prompt set: {prompts}") - print(f"Parallel jobs: {n_jobs}") - if backend == "claude": - print(f"Wipe DB: {wipe_db}") - print(f"MCP binary: {mcp_binary if mcp_binary else 'cargo run (default)'}") - print(f"Output dir: {output_dir if output_dir else './app (default)'}\n") - - # generate all apps - results: list[RunResult] = Parallel(n_jobs=n_jobs, backend="loky", verbose=10)( # type: ignore[assignment] - delayed(run_single_generation)( - app_name, prompt, backend, model, wipe_db, suppress_logs, mcp_binary, mcp_json, mcp_args, output_dir - ) - for app_name, prompt in selected_prompts.items() + print(f"Max concurrency: {max_concurrency}") + print(f"MCP binary: {mcp_binary}") + out_path = Path(output_dir) if output_dir else Path("./app") + print(f"Output dir: {out_path}\n") + + generator = DaggerAppGenerator( + mcp_binary=Path(mcp_binary), + output_dir=out_path, + stream_logs=False, # disable TUI for bulk runs ) - # separate successful and failed generations - successful: list[RunResult] = [] - failed: list[RunResult] = [] - for r in results: - success = r["success"] + # progress bar with success/fail tracking + pbar = tqdm(total=len(selected_prompts), desc="Generating apps", unit="app") + success_count = 0 + fail_count = 0 + + def on_complete(app_name: str, success: bool) -> None: + nonlocal success_count, fail_count if success: - successful.append(r) + success_count += 1 + status = "✓" else: - failed.append(r) + fail_count += 1 + status = "✗" + pbar.set_postfix(ok=success_count, fail=fail_count) + pbar.set_description(f"{status} {app_name}") + pbar.update(1) - apps_dir = "./app/" - # get apps directory from first successful app (used for output file path) - if successful: - first_app_dir = next((r["app_dir"] for r in successful if r["app_dir"]), None) - if first_app_dir: - apps_dir = str(Path(first_app_dir).parent) + try: + results = asyncio.run( + generator.generate_bulk( + selected_prompts, + backend, + model, + mcp_args, + max_concurrency, + on_complete=on_complete, + ) + ) + finally: + pbar.close() + _restore_terminal_cursor() - successful_with_metrics: list[RunResult] = [] - for r in successful: - metrics = r["metrics"] - if metrics is not None: - successful_with_metrics.append(r) + # separate successful and failed (results now include metrics) + successful = [(name, app_dir, log, metrics) for name, app_dir, log, metrics, err in results if err is None] + failed = [(name, log, err) for name, app_dir, log, metrics, err in results if err is not None] + # aggregate metrics from successful runs total_cost = 0.0 - total_input_tokens = 0 - total_output_tokens = 0 + total_tokens = 0 total_turns = 0 - for r in successful_with_metrics: - metrics = r["metrics"] - assert metrics is not None - total_cost += metrics["cost_usd"] - total_input_tokens += metrics["input_tokens"] - total_output_tokens += metrics["output_tokens"] - total_turns += metrics["turns"] + metrics_count = 0 + for _, _, _, metrics in successful: + if 
metrics: + total_cost += metrics.get("cost_usd", 0.0) + total_tokens += metrics.get("input_tokens", 0) + total_turns += metrics.get("turns", 0) + metrics_count += 1 print(f"\n{'=' * 80}") print("Bulk Generation Summary") @@ -274,62 +141,63 @@ def main( print(f"Total prompts: {len(selected_prompts)}") print(f"Successful: {len(successful)}") print(f"Failed: {len(failed)}") - print(f"\nTotal cost: ${total_cost:.4f}") - print(f"Total input tokens: {total_input_tokens}") - print(f"Total output tokens: {total_output_tokens}") - print(f"Total turns: {total_turns}") - if successful_with_metrics: - avg_cost = total_cost / len(successful_with_metrics) - avg_input = total_input_tokens / len(successful_with_metrics) - avg_output = total_output_tokens / len(successful_with_metrics) - avg_turns = total_turns / len(successful_with_metrics) - print("\nAverage per generation:") - print(f" Cost: ${avg_cost:.4f}") - print(f" Input tokens: {avg_input:.0f}") - print(f" Output tokens: {avg_output:.0f}") - print(f" Turns: {avg_turns:.1f}") + if metrics_count > 0: + print(f"\nMetrics (from {metrics_count} runs):") + print(f" Total cost: ${total_cost:.4f}") + print(f" Avg cost: ${total_cost / metrics_count:.4f}") + print(f" Total tokens: {total_tokens:,}") + print(f" Avg tokens: {total_tokens // metrics_count:,}") + print(f" Avg turns: {total_turns / metrics_count:.1f}") - if len(failed) > 0: + if failed: print(f"\n{'=' * 80}") print("Failed generations:") print(f"{'=' * 80}") - for r in failed: - prompt = r["prompt"] - error = r["error"] - print(f" - {prompt[:50]}...") - if error is not None: - print(f" Error: {error}") + for name, log, err in failed: + print(f" - {name}") + print(f" Error: {err}") + if log: + print(f" Log: {log}") - if len(successful) > 0: - apps_with_dirs: list[tuple[str, str]] = [] - for r in successful: - prompt = r["prompt"] - app_dir = r["app_dir"] - if app_dir is not None: - apps_with_dirs.append((prompt, app_dir)) - - if apps_with_dirs: - print(f"\n{'=' * 80}") - print("Generated apps:") - print(f"{'=' * 80}") - for prompt, app_dir in apps_with_dirs: - print(f" - {prompt[:60]}...") - print(f" Dir: {app_dir}") + if successful: + print(f"\n{'=' * 80}") + print("Generated apps:") + print(f"{'=' * 80}") + for name, app_dir, log, metrics in successful: + print(f" - {name}") + print(f" Dir: {app_dir}") + if metrics: + print( + f" Cost: ${metrics.get('cost_usd', 0):.4f}, Tokens: {metrics.get('input_tokens', 0):,}, Turns: {metrics.get('turns', 0)}" + ) print(f"\n{'=' * 80}\n") + # save results json timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") backend_suffix = f"_{backend}" if backend != "claude" else "" - output_file = Path(apps_dir) / Path(f"bulk_run_results{backend_suffix}_{timestamp}.json") - - # ensure directory exists + output_file = out_path / f"bulk_run_results{backend_suffix}_{timestamp}.json" output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.write_text(json.dumps(results, indent=2)) + + results_data = [ + { + "app_name": name, + "success": err is None, + "app_dir": str(app_dir) if app_dir else None, + "log_file": str(log) if log else None, + "error": err, + "backend": backend, + "model": model, + "cost_usd": metrics.get("cost_usd") if metrics else None, + "tokens": metrics.get("input_tokens") if metrics else None, + "turns": metrics.get("turns") if metrics else None, + } + for name, app_dir, log, metrics, err in results + ] + output_file.write_text(json.dumps(results_data, indent=2)) print(f"Results saved to {output_file}") if __name__ == "__main__": - import 
fire - fire.Fire(main) diff --git a/klaudbiusz/cli/generation/codegen.py b/klaudbiusz/cli/generation/codegen.py index 2a74b585..826b9780 100644 --- a/klaudbiusz/cli/generation/codegen.py +++ b/klaudbiusz/cli/generation/codegen.py @@ -95,11 +95,7 @@ async def run_async(self, prompt: str) -> GenerationMetrics: Never deploy the app, just scaffold and build it. """ - disallowed_tools = [ - "NotebookEdit", - "WebSearch", - "WebFetch", - ] + disallowed_tools = ["NotebookEdit", "WebSearch", "WebFetch", "Bash"] command, args = build_mcp_command(self.mcp_binary, self.mcp_manifest, self.mcp_json_path, self.mcp_args) @@ -169,6 +165,13 @@ async def run_async(self, prompt: str) -> GenerationMetrics: print(f"\n❌ Error: {e}", file=sys.stderr) raise finally: + # fallback: detect app_dir from filesystem if not tracked + if not self.scaffold_tracker.app_dir: + detected = self.scaffold_tracker.detect_from_filesystem(self.output_dir) + if detected: + self.scaffold_tracker.app_dir = detected + logger.info(f"📁 Detected app directory from filesystem: {detected}") + # save trajectory via tracker await self.tracker.save( prompt=prompt, diff --git a/klaudbiusz/cli/generation/codegen_multi.py b/klaudbiusz/cli/generation/codegen_multi.py index b26988f4..81de544c 100644 --- a/klaudbiusz/cli/generation/codegen_multi.py +++ b/klaudbiusz/cli/generation/codegen_multi.py @@ -31,9 +31,15 @@ class GenerationMetrics: class MCPSession: - def __init__(self, mcp_binary: str | None = None, mcp_json_path: str | None = None): + def __init__( + self, + mcp_binary: str | None = None, + mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, + ): self.mcp_binary = mcp_binary self.mcp_json_path = mcp_json_path + self.mcp_args = mcp_args self.project_root = Path(__file__).parent.parent.parent.parent self.mcp_manifest = validate_mcp_manifest(mcp_binary, self.project_root) @@ -48,10 +54,10 @@ async def __aenter__(self) -> ClientSession: "DATABRICKS_WAREHOUSE_ID": os.getenv("DATABRICKS_WAREHOUSE_ID", ""), } - command, args = build_mcp_command(self.mcp_binary, self.mcp_manifest, self.mcp_json_path) + command, args = build_mcp_command(self.mcp_binary, self.mcp_manifest, self.mcp_json_path, self.mcp_args) # add workspace tools flag for LiteLLM backend (works for both binary and cargo run) - # only if not using JSON override - if not self.mcp_json_path: + # only if not using JSON override or custom mcp_args + if not self.mcp_json_path and not self.mcp_args: args.append("--with-workspace-tools=true") server_params = StdioServerParameters(command=command, args=args, env=env) @@ -284,6 +290,7 @@ def __init__( model: str, mcp_binary: str | None = None, mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, suppress_logs: bool = False, output_dir: str | None = None, ): @@ -291,6 +298,7 @@ def __init__( self.model = model self.mcp_binary = mcp_binary self.mcp_json_path = mcp_json_path + self.mcp_args = mcp_args self.suppress_logs = suppress_logs self.output_dir = Path(output_dir) if output_dir else Path.cwd() / "app" litellm.drop_params = True @@ -335,7 +343,7 @@ def _build_system_prompt(self) -> str: async def run_async(self, prompt: str) -> GenerationMetrics: setup_logging(self.suppress_logs, self.mcp_binary) - mcp_session = MCPSession(self.mcp_binary, self.mcp_json_path) + mcp_session = MCPSession(self.mcp_binary, self.mcp_json_path, self.mcp_args) agent = None metrics = None diff --git a/klaudbiusz/cli/generation/container_runner.py b/klaudbiusz/cli/generation/container_runner.py new file mode 100644 index 
00000000..484673b6 --- /dev/null +++ b/klaudbiusz/cli/generation/container_runner.py @@ -0,0 +1,76 @@ +"""Runner script executed inside Dagger container.""" + +import json +import sys + +import fire + + +def run( + prompt: str, + app_name: str, + backend: str = "claude", + model: str | None = None, + mcp_args: str | list[str] | None = None, + mcp_binary: str = "/usr/local/bin/edda_mcp", + output_dir: str = "/workspace", +) -> None: + """Run app generation (inside container or locally for debugging). + + Args: + prompt: The prompt describing what to build + app_name: App name for output directory + backend: "claude" or "litellm" + model: Model name (required for litellm) + mcp_args: JSON-encoded list or already-parsed list of MCP server args + mcp_binary: Path to edda_mcp binary (default: /usr/local/bin/edda_mcp for container) + output_dir: Output directory for generated app (default: /workspace for container) + """ + # handle both JSON string and already-parsed list (fire may parse it) + parsed_mcp_args: list[str] | None + match mcp_args: + case None: + parsed_mcp_args = None + case str(): + parsed_mcp_args = json.loads(mcp_args) + case list(): + parsed_mcp_args = mcp_args + + match backend: + case "claude": + from cli.generation.codegen import ClaudeAppBuilder + + builder = ClaudeAppBuilder( + app_name=app_name, + wipe_db=False, + suppress_logs=False, + mcp_binary=mcp_binary, + mcp_args=parsed_mcp_args, + output_dir=output_dir, + ) + metrics = builder.run(prompt, wipe_db=False) + case "litellm": + from cli.generation.codegen_multi import LiteLLMAppBuilder + + if not model: + print("Error: --model is required for litellm backend", file=sys.stderr) + sys.exit(1) + + builder = LiteLLMAppBuilder( + app_name=app_name, + model=model, + mcp_binary=mcp_binary, + mcp_args=parsed_mcp_args, + suppress_logs=False, + output_dir=output_dir, + ) + metrics = builder.run(prompt) + case _: + print(f"Error: Unknown backend: {backend}", file=sys.stderr) + sys.exit(1) + + print(f"Metrics: {metrics}") + + +if __name__ == "__main__": + fire.Fire(run) diff --git a/klaudbiusz/cli/generation/dagger_run.py b/klaudbiusz/cli/generation/dagger_run.py new file mode 100644 index 00000000..fbc69f6f --- /dev/null +++ b/klaudbiusz/cli/generation/dagger_run.py @@ -0,0 +1,277 @@ +"""Dagger-based app generation pipeline with caching and parallelism.""" + +import asyncio +import json +import logging +import os +import subprocess +import sys +from collections.abc import Callable +from pathlib import Path + +import dagger + +from cli.generation.codegen import GenerationMetrics + +logger = logging.getLogger(__name__) + + +def _read_metrics_from_app(app_dir: Path) -> GenerationMetrics | None: + """Read metrics from generation_metrics.json in app directory.""" + metrics_file = app_dir / "generation_metrics.json" + if not metrics_file.exists(): + return None + + try: + data = json.loads(metrics_file.read_text()) + return GenerationMetrics( + cost_usd=data.get("cost_usd", 0.0), + input_tokens=data.get("input_tokens", 0), + output_tokens=data.get("output_tokens", 0), + turns=data.get("turns", 0), + ) + except (json.JSONDecodeError, KeyError) as e: + logger.warning(f"Failed to parse generation metrics: {e}") + return None + + +def _check_binary_format(binary_path: Path) -> None: + """Check if binary is Linux-compatible for container execution. 
+ + Raises: + RuntimeError: If binary is not Linux ELF format + """ + try: + result = subprocess.run( + ["file", str(binary_path)], + capture_output=True, + text=True, + check=True, + ) + output = result.stdout.lower() + + if "mach-o" in output or "darwin" in output: + raise RuntimeError( + f"Binary {binary_path} is macOS format (Mach-O), but Dagger runs Linux containers.\n" + f"Please provide a Linux build. For Go: GOOS=linux GOARCH=arm64 go build ...\n" + f"Output from 'file': {result.stdout.strip()}" + ) + + if "elf" not in output: + logger.warning( + f"Binary {binary_path} may not be Linux-compatible: {result.stdout.strip()}" + ) + except FileNotFoundError: + # 'file' command not available, skip check + pass + + +class DaggerAppGenerator: + """Runs app generation in Dagger container with caching.""" + + def __init__( + self, + mcp_binary: Path, + output_dir: Path, + stream_logs: bool = True, + ): + _check_binary_format(mcp_binary) + self.mcp_binary = mcp_binary + self.output_dir = output_dir + self.stream_logs = stream_logs + + async def generate_single( + self, + prompt: str, + app_name: str, + backend: str = "claude", + model: str | None = None, + mcp_args: list[str] | None = None, + ) -> tuple[Path | None, Path, GenerationMetrics | None]: + """Generate single app, export app dir + logs. + + Returns: + tuple of (app_dir or None, log_file, metrics or None) paths on host. + app_dir is None if agent didn't create an app. + """ + if self.stream_logs: + cfg = dagger.Config(log_output=sys.stderr) + else: + cfg = dagger.Config(log_output=open(os.devnull, "w")) + async with dagger.Connection(cfg) as client: + container = await self._build_container(client) + return await self._run_generation( + client, container, prompt, app_name, backend, model, mcp_args + ) + + async def _run_generation( + self, + client: dagger.Client, + base_container: dagger.Container, + prompt: str, + app_name: str, + backend: str, + model: str | None, + mcp_args: list[str] | None, + ) -> tuple[Path | None, Path, GenerationMetrics | None]: + """Run generation in container and export results.""" + # path inside container for generated app + app_output = f"/workspace/{app_name}" + + # build command using container_runner.py (already in image via Dockerfile COPY) + cmd = [ + "python", + "cli/generation/container_runner.py", + prompt, + f"--app_name={app_name}", + f"--backend={backend}", + ] + if model: + cmd.append(f"--model={model}") + if mcp_args: + cmd.append(f"--mcp_args={json.dumps(mcp_args)}") + + # ensure log directory exists + container = base_container.with_exec(["mkdir", "-p", "/workspace/logs"]) + + # run generation + result = container.with_exec(cmd) + + # prepare log file path + log_file_local = self.output_dir / "logs" / f"{app_name}.log" + log_file_local.parent.mkdir(parents=True, exist_ok=True) + + # capture stdout/stderr - even on failure we want to save what we can + try: + log_content = await result.stdout() + stderr_content = await result.stderr() + full_log = f"{log_content}\n\n=== STDERR ===\n{stderr_content}" if stderr_content else log_content + log_file_local.write_text(full_log) + except dagger.ExecError as e: + # container command failed - save error output as log + full_log = f"=== EXEC ERROR ===\n{e}\n\n=== STDOUT ===\n{e.stdout}\n\n=== STDERR ===\n{e.stderr}" + log_file_local.write_text(full_log) + raise + + # export app directory (if it exists) + app_dir_local = self.output_dir / app_name + try: + await result.directory(app_output).export(str(app_dir_local)) + except dagger.QueryError as e: + 
if "no such file or directory" in str(e): + # agent didn't create an app directory (e.g. just answered a question) + return None, log_file_local, None + raise + + # read metrics from generation_metrics.json + metrics = _read_metrics_from_app(app_dir_local) + return app_dir_local, log_file_local, metrics + + async def generate_bulk( + self, + prompts: dict[str, str], + backend: str = "claude", + model: str | None = None, + mcp_args: list[str] | None = None, + max_concurrency: int = 4, + on_complete: Callable[[str, bool], None] | None = None, + ) -> list[tuple[str, Path | None, Path | None, GenerationMetrics | None, str | None]]: + """Generate multiple apps with Dagger parallelism. + + Uses a single Dagger connection for all generations, allowing Dagger + to optimize container reuse and parallel execution. + + Args: + prompts: dict mapping app_name to prompt + backend: "claude" or "litellm" + model: model name (required for litellm) + mcp_args: optional MCP server args + max_concurrency: max parallel generations + on_complete: callback(app_name, success) called when each app finishes + + Returns: + list of (app_name, app_dir, log_file, metrics, error) tuples + """ + # suppress dagger output for bulk runs + cfg = dagger.Config(log_output=open(os.devnull, "w")) + + async with dagger.Connection(cfg) as client: + # build container once, reuse for all generations + base_container = await self._build_container(client) + sem = asyncio.Semaphore(max_concurrency) + + async def run_with_sem( + app_name: str, prompt: str + ) -> tuple[str, Path | None, Path | None, GenerationMetrics | None, str | None]: + async with sem: + try: + app_dir, log_file, metrics = await self._run_generation( + client, base_container, prompt, app_name, backend, model, mcp_args + ) + if on_complete: + on_complete(app_name, True) + return (app_name, app_dir, log_file, metrics, None) + except Exception as e: + if on_complete: + on_complete(app_name, False) + log_path = self.output_dir / "logs" / f"{app_name}.log" + return (app_name, None, log_path if log_path.exists() else None, None, str(e)) + + tasks = [run_with_sem(name, prompt) for name, prompt in prompts.items()] + return await asyncio.gather(*tasks) + + async def _build_container(self, client: dagger.Client) -> dagger.Container: + """Build container from Dockerfile with layer caching.""" + # build context excluding generated files + context = client.host().directory( + ".", + exclude=[ + "app/", + "app-eval/", + "results/", + ".venv/", + "__pycache__/", + ".git/", + ], + ) + + # build from Dockerfile (leverages BuildKit cache) + container = context.docker_build() + + # mount mcp binary from host (not baked into image) + container = container.with_file( + "/usr/local/bin/edda_mcp", + client.host().file(str(self.mcp_binary)), + permissions=0o755, # make executable + ) + + # pass through env vars from host + env_vars = [ + "ANTHROPIC_API_KEY", + "NEON_DATABASE_URL", + ] + for var in env_vars: + if val := os.environ.get(var): + container = container.with_env_variable(var, val) + + # mount databricks config for CLI authentication (OAuth profile) + # container runs as 'klaudbiusz' user (see Dockerfile) + databrickscfg = Path.home() / ".databrickscfg" + if databrickscfg.exists(): + container = container.with_file( + "/home/klaudbiusz/.databrickscfg", + client.host().file(str(databrickscfg)), + owner="klaudbiusz:klaudbiusz", + ) + + # mount databricks directory for OAuth token cache and other CLI state + # required when using auth_type = databricks-cli + databricks_dir = 
Path.home() / ".databricks" + if databricks_dir.exists(): + container = container.with_directory( + "/home/klaudbiusz/.databricks", + client.host().directory(str(databricks_dir)), + owner="klaudbiusz:klaudbiusz", + ) + + return container diff --git a/klaudbiusz/cli/generation/single_run.py b/klaudbiusz/cli/generation/single_run.py index fc618cf5..a65f370f 100644 --- a/klaudbiusz/cli/generation/single_run.py +++ b/klaudbiusz/cli/generation/single_run.py @@ -1,98 +1,83 @@ -import fire +"""Single app generation via Dagger.""" + +import asyncio +import os from datetime import datetime +from pathlib import Path + +import fire from dotenv import load_dotenv -from cli.generation.codegen import ClaudeAppBuilder, GenerationMetrics as ClaudeGenerationMetrics -from cli.generation.codegen_multi import LiteLLMAppBuilder +from cli.generation.dagger_run import DaggerAppGenerator -# Load environment variables from .env file load_dotenv() +def _restore_terminal_cursor() -> None: + """Restore terminal cursor after Dagger run (workaround for dagger/dagger#7160).""" + os.system("tput cnorm 2>/dev/null || true") + + def run( prompt: str, app_name: str | None = None, backend: str = "claude", model: str | None = None, - wipe_db: bool = True, mcp_binary: str | None = None, - mcp_json: str | None = None, mcp_args: list[str] | None = None, -): - """Run app builder with given prompt. + output_dir: str | None = None, +) -> dict[str, str | None]: + """Run app generation in Dagger container. Args: prompt: The prompt describing what to build app_name: Optional app name (default: timestamp-based) backend: Backend to use ("claude" or "litellm", default: "claude") - model: LLM model (required if backend=litellm, e.g., "openrouter/minimax/minimax-m2") - wipe_db: Whether to wipe database on start - mcp_binary: Optional path to pre-built edda-mcp binary (default: use cargo run) - mcp_json: Optional path to JSON config file for edda_mcp - mcp_args: Optional list of args passed to the MCP server (overrides defaults) + model: LLM model (required if backend=litellm) + mcp_binary: Path to edda_mcp binary (required) + mcp_args: Optional list of args passed to the MCP server Usage: # Claude backend (default) - python main.py "build dashboard" --app_name=my-dashboard + python single_run.py "build dashboard" --mcp_binary=/path/to/edda_mcp # LiteLLM backend - python main.py "build dashboard" --backend=litellm --model=openrouter/minimax/minimax-m2 - python main.py "build dashboard" --backend=litellm --model=gemini/gemini-2.5-pro - - # Custom MCP config - python main.py "build dashboard" --mcp_json=./config/databricks-cli.json + python single_run.py "build dashboard" --backend=litellm --model=gemini/gemini-2.5-pro --mcp_binary=/path/to/edda_mcp # Custom MCP args - python main.py "build dashboard" --mcp_args='["experimental", "apps-mcp"]' + python single_run.py "build dashboard" --mcp_binary=/path/to/edda_mcp --mcp_args='["experimental", "apps-mcp"]' """ + if not mcp_binary: + raise ValueError("--mcp_binary is required") + + if backend == "litellm" and not model: + raise ValueError("--model is required when using --backend=litellm") + if app_name is None: app_name = f"app-{datetime.now().strftime('%Y%m%d-%H%M%S')}" - # single run always shows logs - suppress_logs = False - - match backend: - case "claude": - builder = ClaudeAppBuilder( - app_name=app_name, - wipe_db=wipe_db, - suppress_logs=suppress_logs, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - ) - metrics = builder.run(prompt, wipe_db=wipe_db) - case 
"litellm": - if not model: - raise ValueError("--model is required when using --backend=litellm") - builder_litellm = LiteLLMAppBuilder( - app_name=app_name, - model=model, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - suppress_logs=suppress_logs, - ) - litellm_metrics = builder_litellm.run(prompt) - # convert to dict format for consistent output - metrics: ClaudeGenerationMetrics = { - "cost_usd": litellm_metrics.cost_usd, - "input_tokens": litellm_metrics.input_tokens, - "output_tokens": litellm_metrics.output_tokens, - "turns": litellm_metrics.turns, - "app_dir": litellm_metrics.app_dir, - } - case _: - raise ValueError(f"Unknown backend: {backend}. Use 'claude' or 'litellm'") - - if metrics: - print(f"\n{'=' * 80}") - print("Final metrics:") - print(f" Cost: ${metrics['cost_usd']:.4f}") - print(f" Turns: {metrics['turns']}") - print(f" App dir: {metrics.get('app_dir', 'NOT CAPTURED')}") - print(f"{'=' * 80}\n") - return metrics + generator = DaggerAppGenerator( + mcp_binary=Path(mcp_binary), + output_dir=Path(output_dir) if output_dir else Path("./app"), + ) + + try: + app_dir, log_file = asyncio.run( + generator.generate_single(prompt, app_name, backend, model, mcp_args) + ) + finally: + _restore_terminal_cursor() + + print(f"\n{'=' * 80}") + if app_dir: + print("Generation complete:") + print(f" App: {app_dir}") + else: + print("No app generated (agent may have just answered without creating files)") + print(f" Log: {log_file}") + print(f"{'=' * 80}\n") + + return {"app_dir": str(app_dir) if app_dir else None, "log_file": str(log_file)} def main(): diff --git a/klaudbiusz/cli/utils/shared.py b/klaudbiusz/cli/utils/shared.py index b4e3d69e..3520611c 100644 --- a/klaudbiusz/cli/utils/shared.py +++ b/klaudbiusz/cli/utils/shared.py @@ -76,14 +76,16 @@ def log_text(self, role: str, text: str, emoji: str = "💬") -> None: if not self.suppress_logs: logger.info(f"{emoji} {text}") - self.trajectory_messages.append(Message( - role=role, - content=text, - tool_calls=None, - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role=role, + content=text, + tool_calls=None, + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_tool_call(self, tool_name: str, arguments: dict[str, Any], tool_id: str) -> None: """Log tool call from assistant. @@ -100,14 +102,16 @@ def log_tool_call(self, tool_name: str, arguments: dict[str, Any], tool_id: str) logger.info(f"🔧 Tool: {tool_name}({truncated})") # trajectory collection - self.trajectory_messages.append(Message( - role="assistant", - content=None, - tool_calls=[ToolCall(id=tool_id, name=tool_name, arguments=arguments)], - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="assistant", + content=None, + tool_calls=[ToolCall(id=tool_id, name=tool_name, arguments=arguments)], + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_tool_result(self, tool_id: str, result: str, is_error: bool = False) -> None: """Log tool result from environment. 
@@ -126,14 +130,16 @@ def log_tool_result(self, tool_id: str, result: str, is_error: bool = False) -> logger.info(f"✅ Tool result: {truncated}") # trajectory collection - self.trajectory_messages.append(Message( - role="tool", - content=None, - tool_calls=None, - tool_results=[ToolResult(tool_call_id=tool_id, content=result, is_error=is_error)], - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="tool", + content=None, + tool_calls=None, + tool_results=[ToolResult(tool_call_id=tool_id, content=result, is_error=is_error)], + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_subagent_invoke(self, subagent_type: str, description: str, prompt: str) -> None: """Log subagent delegation (Claude SDK specific).""" @@ -144,14 +150,16 @@ def log_subagent_invoke(self, subagent_type: str, description: str, prompt: str) logger.info(f" Instructions: {truncated}") # add to trajectory as assistant message (contextual info) - self.trajectory_messages.append(Message( - role="assistant", - content=f"[Delegating to subagent: {subagent_type}] {description}", - tool_calls=None, - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="assistant", + content=f"[Delegating to subagent: {subagent_type}] {description}", + tool_calls=None, + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_todo_update(self, todos: list[dict[str, Any]]) -> None: """Log todo list update.""" @@ -165,14 +173,16 @@ def log_todo_update(self, todos: list[dict[str, Any]]) -> None: # add to trajectory as assistant message (contextual info) summary = f"Todo update: {sum(1 for t in todos if t.get('status') == 'completed')}/{len(todos)} completed" - self.trajectory_messages.append(Message( - role="assistant", - content=f"[{summary}]", - tool_calls=None, - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="assistant", + content=f"[{summary}]", + tool_calls=None, + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_session_complete( self, @@ -219,6 +229,9 @@ async def save( if not self.trajectory_messages: return + if not self.app_name: + logger.warning("⚠️ App name not set, skipping trajectory save") + trajectory = Trajectory( run_id=str(self.run_id), app_name=self.app_name, @@ -261,6 +274,36 @@ def resolve(self, tool_id: str) -> None: if tool_id in self._pending: self.app_dir = self._pending.pop(tool_id) + def detect_from_filesystem(self, search_root: Path | None = None) -> str | None: + """Fallback: detect scaffold by finding package.json at top level. + + Globs for package.json and chooses the one closest to search_root. + This works in dagger environments where /workspace is the base. 
+
+        Args:
+            search_root: Root directory to search from (e.g., /workspace or output_dir)
+
+        Returns:
+            Path to detected app directory, or None if not found
+        """
+        if search_root is None:
+            search_root = Path("/workspace")
+
+        if not search_root.exists():
+            logger.warning(f"⚠️ Search root does not exist: {search_root}")
+            return None
+
+        logger.info(f"🔍 Searching for scaffolded app directory under {search_root}")
+        # glob for app marker files (databricks.yml)
+        marker_files = list(search_root.glob("**/databricks.yml"))
+        if not marker_files:
+            logger.warning("⚠️ Could not detect scaffolded app directory from filesystem")
+            return None
+        file, *_ = marker_files
+        app_dir = file.parent
+        logger.info(f"✅ Detected scaffolded app directory: {app_dir}")
+        return str(app_dir)
+

 def validate_mcp_manifest(mcp_binary: str | None, project_root: Path) -> Path | None:
     """Validate MCP manifest exists if using cargo run.
@@ -293,6 +336,7 @@ def setup_logging(suppress_logs: bool, mcp_binary: str | None = None) -> None:
     else:
         try:
             import coloredlogs  # type: ignore[import-untyped]
+
             coloredlogs.install(level="INFO")
         except ImportError:
             logging.basicConfig(level=logging.INFO)