diff --git a/klaudbiusz/.dockerignore b/klaudbiusz/.dockerignore new file mode 100644 index 00000000..c656b6b1 --- /dev/null +++ b/klaudbiusz/.dockerignore @@ -0,0 +1,25 @@ +# ignore generated apps and artifacts +app/ +app-eval/ +results/ + +# ignore python cache +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.ruff_cache/ + +# ignore git +.git/ +.gitignore + +# ignore env files (will be mounted) +.env + +# ignore venv +.venv/ + +# ignore build artifacts +.DS_Store diff --git a/klaudbiusz/Dockerfile b/klaudbiusz/Dockerfile new file mode 100644 index 00000000..8a7c18b4 --- /dev/null +++ b/klaudbiusz/Dockerfile @@ -0,0 +1,37 @@ +FROM python:3.12-slim + +# install databricks cli and node.js (for claude agent sdk) +RUN apt-get update && \ + apt-get install -y curl unzip git && \ + curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh && \ + curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \ + apt-get install -y nodejs && \ + npm install -g @anthropic-ai/claude-code && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +# install uv for package management +RUN pip install uv + +# install dependencies (cached until pyproject.toml changes) +COPY pyproject.toml README.md ./ +RUN mkdir -p cli && touch cli/__init__.py && \ + uv pip install --system -e . && \ + rm -rf cli + +# copy only generation-related source code (exclude evaluation to prevent reward hacking) +COPY cli/__init__.py ./cli/ +COPY cli/trajectory.py ./cli/ +COPY cli/generation/ ./cli/generation/ +COPY cli/utils/ ./cli/utils/ + +# create non-root user (claude agent sdk requires non-root for security) +RUN useradd -m -s /bin/bash klaudbiusz && \ + chown -R klaudbiusz:klaudbiusz /workspace + +USER klaudbiusz + +# set working directory for app generation +ENV APP_OUTPUT_DIR=/workspace/app diff --git a/klaudbiusz/README.md b/klaudbiusz/README.md index 49080ece..c5293ffd 100644 --- a/klaudbiusz/README.md +++ b/klaudbiusz/README.md @@ -21,42 +21,55 @@ cp .env.example .env ``` ### Generate Applications + +Generation runs inside Dagger containers for isolation and reproducibility. 
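+The CLI entry points below are thin wrappers around `DaggerAppGenerator` in `cli/generation/dagger_run.py`; a minimal programmatic sketch of the same flow (the binary path and app name are placeholders, and the same prerequisites listed below apply):
+
+```python
+import asyncio
+from pathlib import Path
+
+from cli.generation.dagger_run import DaggerAppGenerator
+
+generator = DaggerAppGenerator(
+    mcp_binary=Path("/path/to/linux/edda_mcp"),  # placeholder: Linux build of edda_mcp
+    output_dir=Path("./app"),
+)
+# generate_single returns (app_dir or None, log_file, metrics or None)
+app_dir, log_file, metrics = asyncio.run(
+    generator.generate_single(
+        "Create a customer churn analysis dashboard",
+        app_name="churn-dashboard",  # placeholder app name
+        mcp_args=["experimental", "apps-mcp"],
+    )
+)
+print(app_dir, log_file, metrics)
+```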
+ +**Prerequisites:** +- Docker running +- Linux build of edda_mcp binary (for Dagger containers) +- Databricks CLI OAuth configured (`~/.databrickscfg` + `~/.databricks/token-cache.json`) + ```bash cd klaudbiusz - # make sure app folder is empty cli/archive_evaluation.sh cli/cleanup_evaluation.sh -# Generate a single app (Claude backend, default) -uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" - -# Use LiteLLM backend with specific model +# Generate a single app via Dagger (requires Linux binary) uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \ - --backend=litellm --model=openrouter/minimax/minimax-m2 + --mcp_binary=/path/to/linux/edda_mcp \ + --mcp_args='["experimental", "apps-mcp"]' -# Batch generate from prompts (databricks set by default) -uv run cli/generation/bulk_run.py +# Batch generate from prompts +uv run cli/generation/bulk_run.py \ + --mcp_binary=/path/to/linux/edda_mcp \ + --mcp_args='["experimental", "apps-mcp"]' -# Batch generate with test prompts -uv run cli/generation/bulk_run.py --prompts=test +# Use LiteLLM backend with specific model +uv run cli/generation/single_run.py "Create a customer churn analysis dashboard" \ + --backend=litellm --model=gemini/gemini-2.5-pro \ + --mcp_binary=/path/to/linux/edda_mcp +``` -# Batch generate with LiteLLM backend -uv run cli/generation/bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro +**Building Linux binary (for macOS users):** +```bash +cd /path/to/cli +GOOS=linux GOARCH=arm64 go build -o cli-linux . +# Then use --mcp_binary=/path/to/cli-linux +``` -# Custom output directory -uv run cli/generation/bulk_run.py --output-dir=/path/to/custom/folder +### Local Debugging (without Dagger) -# Custom MCP binary (for testing modified edda_mcp) -uv run cli/generation/bulk_run.py --mcp-binary=/path/to/custom/edda_mcp +For faster iteration during development, run directly on host: -# Combined example -uv run cli/generation/bulk_run.py \ - --backend=litellm \ - --model=gemini/gemini-2.5-pro \ - --output-dir=./my-apps \ - --mcp-binary=../edda/target/release/edda_mcp +```bash +# Local run with macOS binary +uv run python cli/generation/container_runner.py "Create a dashboard" \ + --app_name=debug-test \ + --mcp_binary=/usr/local/bin/edda_mcp \ + --mcp_args='["experimental", "apps-mcp"]' \ + --output_dir=./app ``` ### Evaluate Generated Apps @@ -155,8 +168,10 @@ klaudbiusz/ │ │ ├── prompts/ # Prompt collections │ │ ├── codegen.py # Claude Agent SDK backend │ │ ├── codegen_multi.py # LiteLLM backend -│ │ ├── single_run.py # Single app generation -│ │ ├── bulk_run.py # Batch app generation +│ │ ├── dagger_run.py # Dagger container orchestration +│ │ ├── container_runner.py # Runner script (inside container or local) +│ │ ├── single_run.py # Single app generation (via Dagger) +│ │ ├── bulk_run.py # Batch app generation (via Dagger) │ │ └── screenshot.py # Batch screenshotting │ ├── evaluation/ # App evaluation │ │ ├── evaluate_all.py # Batch evaluation diff --git a/klaudbiusz/cli/analyze_trajectories.py b/klaudbiusz/cli/analyze_trajectories.py index 1d00eea2..cabdd000 100644 --- a/klaudbiusz/cli/analyze_trajectories.py +++ b/klaudbiusz/cli/analyze_trajectories.py @@ -136,7 +136,12 @@ async def analyze_single_trajectory(trajectory_md: str, app_name: str, model: st return response.choices[0].message.content # type: ignore[attr-defined] -def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_json_path: str | None = None) -> str: +def 
get_mcp_tools_description( + mcp_binary: str | None, + project_root: Path, + mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, +) -> str: """Extract MCP tool definitions by querying the MCP server. Returns empty string if mcp_binary is not provided. @@ -145,7 +150,7 @@ def get_mcp_tools_description(mcp_binary: str | None, project_root: Path, mcp_js return "" mcp_manifest = validate_mcp_manifest(mcp_binary, project_root) - command, args = build_mcp_command(mcp_binary, mcp_manifest, mcp_json_path) + command, args = build_mcp_command(mcp_binary, mcp_manifest, mcp_json_path, mcp_args) proc = subprocess.Popen( [command, *args], @@ -403,6 +408,7 @@ async def analyze_trajectories_async( trajectories_pattern: str = "./app/*/trajectory.jsonl", eval_report_path: str | None = None, mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, ): """Analyze trajectories using map-reduce approach with LLM, then agent-based analysis.""" litellm.drop_params = True @@ -414,7 +420,7 @@ async def analyze_trajectories_async( mcp_tools_doc = "" if mcp_binary: logger.info("📋 Extracting MCP tool definitions") - mcp_tools_doc = get_mcp_tools_description(mcp_binary, project_root, mcp_json_path) + mcp_tools_doc = get_mcp_tools_description(mcp_binary, project_root, mcp_json_path, mcp_args) eval_report = "" if eval_report_path: @@ -460,6 +466,7 @@ def cli( map_model: str = "anthropic/claude-haiku-4-5", eval_report: str | None = None, mcp_json: str | None = None, + mcp_args: list[str] | None = None, ): """Analyze agent trajectories to find friction points and patterns. @@ -472,6 +479,7 @@ def cli( map_model: LiteLLM model identifier for individual trajectory analysis eval_report: Path to evaluation report JSON (optional) mcp_json: Optional path to JSON config file for edda_mcp + mcp_args: Optional list of args passed to the MCP server (overrides defaults) """ coloredlogs.install( level=logging.INFO, @@ -490,6 +498,7 @@ def cli( trajectories_pattern, eval_report, mcp_json, + mcp_args, ) ) diff --git a/klaudbiusz/cli/evaluation/evaluate_app_dagger.py b/klaudbiusz/cli/evaluation/evaluate_app_dagger.py index 90396bc0..54f0f172 100644 --- a/klaudbiusz/cli/evaluation/evaluate_app_dagger.py +++ b/klaudbiusz/cli/evaluation/evaluate_app_dagger.py @@ -7,6 +7,7 @@ import asyncio import json +import os import sys import time from dataclasses import asdict @@ -55,6 +56,11 @@ break +def _restore_terminal_cursor() -> None: + """Restore terminal cursor after Dagger run (workaround for dagger/dagger#7160).""" + os.system("tput cnorm 2>/dev/null || true") + + async def evaluate_app_async( client: dagger.Client, app_dir: Path, @@ -390,7 +396,10 @@ async def main_async(): def main(): """Sync wrapper for async main.""" - asyncio.run(main_async()) + try: + asyncio.run(main_async()) + finally: + _restore_terminal_cursor() if __name__ == "__main__": diff --git a/klaudbiusz/cli/generation/bulk_run.py b/klaudbiusz/cli/generation/bulk_run.py index fa78e0a9..7046c1fb 100644 --- a/klaudbiusz/cli/generation/bulk_run.py +++ b/klaudbiusz/cli/generation/bulk_run.py @@ -1,199 +1,64 @@ -"""Bulk runner for generating multiple apps from hardcoded prompts.""" +"""Bulk app generation via Dagger with parallelism.""" +import asyncio import json import os -import signal -import sys from datetime import datetime from pathlib import Path -from typing import TypedDict +import fire from dotenv import load_dotenv -from joblib import Parallel, delayed +from tqdm import tqdm -from cli.generation.codegen import ClaudeAppBuilder -from 
cli.generation.codegen import GenerationMetrics as ClaudeGenerationMetrics -from cli.generation.codegen_multi import LiteLLMAppBuilder -from cli.generation.prompts.databricks import PROMPTS as DATABRICKS_PROMPTS -from cli.utils.litellm_multiprocess_fix import patch_litellm_for_multiprocessing +from cli.generation.dagger_run import DaggerAppGenerator -patch_litellm_for_multiprocessing() - -# Unified type for metrics from both backends -GenerationMetrics = ClaudeGenerationMetrics - -# Load environment variables from .env file load_dotenv() -# Re-export for eval compatibility -PROMPTS = DATABRICKS_PROMPTS - - -class RunResult(TypedDict): - prompt: str - success: bool - metrics: GenerationMetrics | None - error: str | None - app_dir: str | None - mcp_binary: str | None - backend: str - model: str | None - - -def run_single_generation( - app_name: str, - prompt: str, - backend: str, - model: str | None, - wipe_db: bool = False, - suppress_logs: bool = True, - mcp_binary: str | None = None, - mcp_json: str | None = None, - mcp_args: list[str] | None = None, - output_dir: str | None = None, -) -> RunResult: - # re-apply litellm patch in worker process (joblib uses spawn/fork) - if backend == "litellm": - patch_litellm_for_multiprocessing() - - def timeout_handler(signum, frame): - raise TimeoutError("Generation timed out after 1200 seconds") - - try: - # set 20 minute timeout for entire generation - signal.signal(signal.SIGALRM, timeout_handler) - signal.alarm(1200) - - match backend: - case "claude": - codegen = ClaudeAppBuilder( - app_name=app_name, - wipe_db=wipe_db, - suppress_logs=suppress_logs, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - output_dir=output_dir, - ) - metrics = codegen.run(prompt, wipe_db=wipe_db) - app_dir = metrics.get("app_dir") if metrics else None - case "litellm": - if not model: - raise ValueError("--model is required when using --backend=litellm") - builder = LiteLLMAppBuilder( - app_name=app_name, - model=model, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - suppress_logs=suppress_logs, - output_dir=output_dir, - ) - litellm_metrics = builder.run(prompt) - # convert LiteLLM metrics to dict format matching Claude SDK - metrics: GenerationMetrics = { - "cost_usd": litellm_metrics.cost_usd, - "input_tokens": litellm_metrics.input_tokens, - "output_tokens": litellm_metrics.output_tokens, - "turns": litellm_metrics.turns, - "app_dir": litellm_metrics.app_dir, - } - app_dir = litellm_metrics.app_dir - case _: - raise ValueError(f"Unknown backend: {backend}. Use 'claude' or 'litellm'") - - signal.alarm(0) # cancel timeout - return { - "prompt": prompt, - "success": True, - "metrics": metrics, - "error": None, - "app_dir": app_dir, - "mcp_binary": mcp_binary, - "backend": backend, - "model": model, - } - except TimeoutError as e: - signal.alarm(0) # cancel timeout - print(f"[TIMEOUT] {prompt[:80]}...", file=sys.stderr, flush=True) - return { - "prompt": prompt, - "success": False, - "metrics": None, - "error": str(e), - "app_dir": None, - "mcp_binary": mcp_binary, - "backend": backend, - "model": model, - } - except Exception as e: - signal.alarm(0) # cancel timeout - print(f"[ERROR] {prompt[:80]}... 
- {e}", file=sys.stderr, flush=True) - return { - "prompt": prompt, - "success": False, - "metrics": None, - "error": str(e), - "app_dir": None, - "mcp_binary": mcp_binary, - "backend": backend, - "model": model, - } +def _restore_terminal_cursor() -> None: + """Restore terminal cursor after Dagger run (workaround for dagger/dagger#7160).""" + os.system("tput cnorm 2>/dev/null || true") def main( prompts: str = "databricks", backend: str = "claude", model: str | None = None, - wipe_db: bool = False, - n_jobs: int = -1, mcp_binary: str | None = None, - mcp_json: str | None = None, mcp_args: list[str] | None = None, output_dir: str | None = None, + max_concurrency: int = 6, ) -> None: - """Bulk app generation from predefined prompt sets. + """Bulk app generation via Dagger with parallelism. Args: - prompts: Prompt set to use ("databricks", "databricks_v2", or "test", default: "databricks") - backend: Backend to use ("claude" or "litellm", default: "claude") - model: LLM model (required if backend=litellm, e.g., "openrouter/minimax/minimax-m2") - wipe_db: Whether to wipe database on start - n_jobs: Number of parallel jobs (-1 for all cores) - mcp_args: Optional list of args passed to the MCP server (overrides defaults) - mcp_binary: Optional path to pre-built edda-mcp binary (default: use cargo run) - mcp_json: Optional path to JSON config file for edda_mcp - output_dir: Custom output directory for generated apps (default: ./app) + prompts: Prompt set to use ("databricks", "databricks_v2", or "test") + backend: Backend to use ("claude" or "litellm") + model: LLM model (required if backend=litellm) + mcp_binary: Path to edda_mcp binary (required) + mcp_args: Optional list of args passed to the MCP server + output_dir: Custom output directory for generated apps + max_concurrency: Maximum parallel generations (default: 4) Usage: - # Claude backend (default) with databricks prompts (default) - python bulk_run.py - - # Claude backend with databricks_v2 prompts - python bulk_run.py --prompts=databricks_v2 + # Claude backend with databricks prompts + python bulk_run.py --mcp_binary=/path/to/edda_mcp - # Claude backend with test prompts - python bulk_run.py --prompts=test + # With custom concurrency + python bulk_run.py --mcp_binary=/path/to/edda_mcp --max_concurrency=8 # LiteLLM backend - python bulk_run.py --backend=litellm --model=openrouter/minimax/minimax-m2 - python bulk_run.py --prompts=test --backend=litellm --model=gemini/gemini-2.5-pro - - # Custom MCP config - python bulk_run.py --mcp_json=./config/databricks-cli.json - - # Custom output directory - python bulk_run.py --output-dir=/path/to/custom/folder + python bulk_run.py --backend=litellm --model=gemini/gemini-2.5-pro --mcp_binary=/path/to/edda_mcp + """ + if not mcp_binary: + raise ValueError("--mcp_binary is required") - # Custom MCP binary - python bulk_run.py --mcp-binary=/path/to/edda_mcp + if backend == "litellm" and not model: + raise ValueError("--model is required when using --backend=litellm") - # Optional: Run screenshots after generation - python screenshot.py ./app --concurrency=5 --wait-time=120000 - """ - # bulk run always suppresses logs - suppress_logs = True + # validate required environment variables + if not os.environ.get("DATABRICKS_HOST") or not os.environ.get("DATABRICKS_TOKEN"): + raise ValueError("DATABRICKS_HOST and DATABRICKS_TOKEN environment variables must be set") # load prompt set match prompts: @@ -206,67 +71,69 @@ def main( case _: raise ValueError(f"Unknown prompt set: {prompts}. 
Use 'databricks', 'databricks_v2', or 'test'") - # validate backend-specific requirements - if backend == "litellm" and not model: - raise ValueError("--model is required when using --backend=litellm") - - # validate required environment variables - if not os.environ.get("DATABRICKS_HOST") or not os.environ.get("DATABRICKS_TOKEN"): - raise ValueError("DATABRICKS_HOST and DATABRICKS_TOKEN environment variables must be set") - print(f"Starting bulk generation for {len(selected_prompts)} prompts...") print(f"Backend: {backend}") if backend == "litellm": print(f"Model: {model}") print(f"Prompt set: {prompts}") - print(f"Parallel jobs: {n_jobs}") - if backend == "claude": - print(f"Wipe DB: {wipe_db}") - print(f"MCP binary: {mcp_binary if mcp_binary else 'cargo run (default)'}") - print(f"Output dir: {output_dir if output_dir else './app (default)'}\n") - - # generate all apps - results: list[RunResult] = Parallel(n_jobs=n_jobs, backend="loky", verbose=10)( # type: ignore[assignment] - delayed(run_single_generation)( - app_name, prompt, backend, model, wipe_db, suppress_logs, mcp_binary, mcp_json, mcp_args, output_dir - ) - for app_name, prompt in selected_prompts.items() + print(f"Max concurrency: {max_concurrency}") + print(f"MCP binary: {mcp_binary}") + out_path = Path(output_dir) if output_dir else Path("./app") + print(f"Output dir: {out_path}\n") + + generator = DaggerAppGenerator( + mcp_binary=Path(mcp_binary), + output_dir=out_path, + stream_logs=False, # disable TUI for bulk runs ) - # separate successful and failed generations - successful: list[RunResult] = [] - failed: list[RunResult] = [] - for r in results: - success = r["success"] + # progress bar with success/fail tracking + pbar = tqdm(total=len(selected_prompts), desc="Generating apps", unit="app") + success_count = 0 + fail_count = 0 + + def on_complete(app_name: str, success: bool) -> None: + nonlocal success_count, fail_count if success: - successful.append(r) + success_count += 1 + status = "✓" else: - failed.append(r) + fail_count += 1 + status = "✗" + pbar.set_postfix(ok=success_count, fail=fail_count) + pbar.set_description(f"{status} {app_name}") + pbar.update(1) - apps_dir = "./app/" - # get apps directory from first successful app (used for output file path) - if successful: - first_app_dir = next((r["app_dir"] for r in successful if r["app_dir"]), None) - if first_app_dir: - apps_dir = str(Path(first_app_dir).parent) + try: + results = asyncio.run( + generator.generate_bulk( + selected_prompts, + backend, + model, + mcp_args, + max_concurrency, + on_complete=on_complete, + ) + ) + finally: + pbar.close() + _restore_terminal_cursor() - successful_with_metrics: list[RunResult] = [] - for r in successful: - metrics = r["metrics"] - if metrics is not None: - successful_with_metrics.append(r) + # separate successful and failed (results now include metrics) + successful = [(name, app_dir, log, metrics) for name, app_dir, log, metrics, err in results if err is None] + failed = [(name, log, err) for name, app_dir, log, metrics, err in results if err is not None] + # aggregate metrics from successful runs total_cost = 0.0 - total_input_tokens = 0 - total_output_tokens = 0 + total_tokens = 0 total_turns = 0 - for r in successful_with_metrics: - metrics = r["metrics"] - assert metrics is not None - total_cost += metrics["cost_usd"] - total_input_tokens += metrics["input_tokens"] - total_output_tokens += metrics["output_tokens"] - total_turns += metrics["turns"] + metrics_count = 0 + for _, _, _, metrics in successful: + if 
metrics: + total_cost += metrics.get("cost_usd", 0.0) + total_tokens += metrics.get("input_tokens", 0) + total_turns += metrics.get("turns", 0) + metrics_count += 1 print(f"\n{'=' * 80}") print("Bulk Generation Summary") @@ -274,62 +141,63 @@ def main( print(f"Total prompts: {len(selected_prompts)}") print(f"Successful: {len(successful)}") print(f"Failed: {len(failed)}") - print(f"\nTotal cost: ${total_cost:.4f}") - print(f"Total input tokens: {total_input_tokens}") - print(f"Total output tokens: {total_output_tokens}") - print(f"Total turns: {total_turns}") - if successful_with_metrics: - avg_cost = total_cost / len(successful_with_metrics) - avg_input = total_input_tokens / len(successful_with_metrics) - avg_output = total_output_tokens / len(successful_with_metrics) - avg_turns = total_turns / len(successful_with_metrics) - print("\nAverage per generation:") - print(f" Cost: ${avg_cost:.4f}") - print(f" Input tokens: {avg_input:.0f}") - print(f" Output tokens: {avg_output:.0f}") - print(f" Turns: {avg_turns:.1f}") + if metrics_count > 0: + print(f"\nMetrics (from {metrics_count} runs):") + print(f" Total cost: ${total_cost:.4f}") + print(f" Avg cost: ${total_cost / metrics_count:.4f}") + print(f" Total tokens: {total_tokens:,}") + print(f" Avg tokens: {total_tokens // metrics_count:,}") + print(f" Avg turns: {total_turns / metrics_count:.1f}") - if len(failed) > 0: + if failed: print(f"\n{'=' * 80}") print("Failed generations:") print(f"{'=' * 80}") - for r in failed: - prompt = r["prompt"] - error = r["error"] - print(f" - {prompt[:50]}...") - if error is not None: - print(f" Error: {error}") + for name, log, err in failed: + print(f" - {name}") + print(f" Error: {err}") + if log: + print(f" Log: {log}") - if len(successful) > 0: - apps_with_dirs: list[tuple[str, str]] = [] - for r in successful: - prompt = r["prompt"] - app_dir = r["app_dir"] - if app_dir is not None: - apps_with_dirs.append((prompt, app_dir)) - - if apps_with_dirs: - print(f"\n{'=' * 80}") - print("Generated apps:") - print(f"{'=' * 80}") - for prompt, app_dir in apps_with_dirs: - print(f" - {prompt[:60]}...") - print(f" Dir: {app_dir}") + if successful: + print(f"\n{'=' * 80}") + print("Generated apps:") + print(f"{'=' * 80}") + for name, app_dir, log, metrics in successful: + print(f" - {name}") + print(f" Dir: {app_dir}") + if metrics: + print( + f" Cost: ${metrics.get('cost_usd', 0):.4f}, Tokens: {metrics.get('input_tokens', 0):,}, Turns: {metrics.get('turns', 0)}" + ) print(f"\n{'=' * 80}\n") + # save results json timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") backend_suffix = f"_{backend}" if backend != "claude" else "" - output_file = Path(apps_dir) / Path(f"bulk_run_results{backend_suffix}_{timestamp}.json") - - # ensure directory exists + output_file = out_path / f"bulk_run_results{backend_suffix}_{timestamp}.json" output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.write_text(json.dumps(results, indent=2)) + + results_data = [ + { + "app_name": name, + "success": err is None, + "app_dir": str(app_dir) if app_dir else None, + "log_file": str(log) if log else None, + "error": err, + "backend": backend, + "model": model, + "cost_usd": metrics.get("cost_usd") if metrics else None, + "tokens": metrics.get("input_tokens") if metrics else None, + "turns": metrics.get("turns") if metrics else None, + } + for name, app_dir, log, metrics, err in results + ] + output_file.write_text(json.dumps(results_data, indent=2)) print(f"Results saved to {output_file}") if __name__ == "__main__": - import 
fire - fire.Fire(main) diff --git a/klaudbiusz/cli/generation/codegen.py b/klaudbiusz/cli/generation/codegen.py index 2a74b585..826b9780 100644 --- a/klaudbiusz/cli/generation/codegen.py +++ b/klaudbiusz/cli/generation/codegen.py @@ -95,11 +95,7 @@ async def run_async(self, prompt: str) -> GenerationMetrics: Never deploy the app, just scaffold and build it. """ - disallowed_tools = [ - "NotebookEdit", - "WebSearch", - "WebFetch", - ] + disallowed_tools = ["NotebookEdit", "WebSearch", "WebFetch", "Bash"] command, args = build_mcp_command(self.mcp_binary, self.mcp_manifest, self.mcp_json_path, self.mcp_args) @@ -169,6 +165,13 @@ async def run_async(self, prompt: str) -> GenerationMetrics: print(f"\n❌ Error: {e}", file=sys.stderr) raise finally: + # fallback: detect app_dir from filesystem if not tracked + if not self.scaffold_tracker.app_dir: + detected = self.scaffold_tracker.detect_from_filesystem(self.output_dir) + if detected: + self.scaffold_tracker.app_dir = detected + logger.info(f"📁 Detected app directory from filesystem: {detected}") + # save trajectory via tracker await self.tracker.save( prompt=prompt, diff --git a/klaudbiusz/cli/generation/codegen_multi.py b/klaudbiusz/cli/generation/codegen_multi.py index b26988f4..81de544c 100644 --- a/klaudbiusz/cli/generation/codegen_multi.py +++ b/klaudbiusz/cli/generation/codegen_multi.py @@ -31,9 +31,15 @@ class GenerationMetrics: class MCPSession: - def __init__(self, mcp_binary: str | None = None, mcp_json_path: str | None = None): + def __init__( + self, + mcp_binary: str | None = None, + mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, + ): self.mcp_binary = mcp_binary self.mcp_json_path = mcp_json_path + self.mcp_args = mcp_args self.project_root = Path(__file__).parent.parent.parent.parent self.mcp_manifest = validate_mcp_manifest(mcp_binary, self.project_root) @@ -48,10 +54,10 @@ async def __aenter__(self) -> ClientSession: "DATABRICKS_WAREHOUSE_ID": os.getenv("DATABRICKS_WAREHOUSE_ID", ""), } - command, args = build_mcp_command(self.mcp_binary, self.mcp_manifest, self.mcp_json_path) + command, args = build_mcp_command(self.mcp_binary, self.mcp_manifest, self.mcp_json_path, self.mcp_args) # add workspace tools flag for LiteLLM backend (works for both binary and cargo run) - # only if not using JSON override - if not self.mcp_json_path: + # only if not using JSON override or custom mcp_args + if not self.mcp_json_path and not self.mcp_args: args.append("--with-workspace-tools=true") server_params = StdioServerParameters(command=command, args=args, env=env) @@ -284,6 +290,7 @@ def __init__( model: str, mcp_binary: str | None = None, mcp_json_path: str | None = None, + mcp_args: list[str] | None = None, suppress_logs: bool = False, output_dir: str | None = None, ): @@ -291,6 +298,7 @@ def __init__( self.model = model self.mcp_binary = mcp_binary self.mcp_json_path = mcp_json_path + self.mcp_args = mcp_args self.suppress_logs = suppress_logs self.output_dir = Path(output_dir) if output_dir else Path.cwd() / "app" litellm.drop_params = True @@ -335,7 +343,7 @@ def _build_system_prompt(self) -> str: async def run_async(self, prompt: str) -> GenerationMetrics: setup_logging(self.suppress_logs, self.mcp_binary) - mcp_session = MCPSession(self.mcp_binary, self.mcp_json_path) + mcp_session = MCPSession(self.mcp_binary, self.mcp_json_path, self.mcp_args) agent = None metrics = None diff --git a/klaudbiusz/cli/generation/container_runner.py b/klaudbiusz/cli/generation/container_runner.py new file mode 100644 index 
00000000..484673b6 --- /dev/null +++ b/klaudbiusz/cli/generation/container_runner.py @@ -0,0 +1,76 @@ +"""Runner script executed inside Dagger container.""" + +import json +import sys + +import fire + + +def run( + prompt: str, + app_name: str, + backend: str = "claude", + model: str | None = None, + mcp_args: str | list[str] | None = None, + mcp_binary: str = "/usr/local/bin/edda_mcp", + output_dir: str = "/workspace", +) -> None: + """Run app generation (inside container or locally for debugging). + + Args: + prompt: The prompt describing what to build + app_name: App name for output directory + backend: "claude" or "litellm" + model: Model name (required for litellm) + mcp_args: JSON-encoded list or already-parsed list of MCP server args + mcp_binary: Path to edda_mcp binary (default: /usr/local/bin/edda_mcp for container) + output_dir: Output directory for generated app (default: /workspace for container) + """ + # handle both JSON string and already-parsed list (fire may parse it) + parsed_mcp_args: list[str] | None + match mcp_args: + case None: + parsed_mcp_args = None + case str(): + parsed_mcp_args = json.loads(mcp_args) + case list(): + parsed_mcp_args = mcp_args + + match backend: + case "claude": + from cli.generation.codegen import ClaudeAppBuilder + + builder = ClaudeAppBuilder( + app_name=app_name, + wipe_db=False, + suppress_logs=False, + mcp_binary=mcp_binary, + mcp_args=parsed_mcp_args, + output_dir=output_dir, + ) + metrics = builder.run(prompt, wipe_db=False) + case "litellm": + from cli.generation.codegen_multi import LiteLLMAppBuilder + + if not model: + print("Error: --model is required for litellm backend", file=sys.stderr) + sys.exit(1) + + builder = LiteLLMAppBuilder( + app_name=app_name, + model=model, + mcp_binary=mcp_binary, + mcp_args=parsed_mcp_args, + suppress_logs=False, + output_dir=output_dir, + ) + metrics = builder.run(prompt) + case _: + print(f"Error: Unknown backend: {backend}", file=sys.stderr) + sys.exit(1) + + print(f"Metrics: {metrics}") + + +if __name__ == "__main__": + fire.Fire(run) diff --git a/klaudbiusz/cli/generation/dagger_run.py b/klaudbiusz/cli/generation/dagger_run.py new file mode 100644 index 00000000..fbc69f6f --- /dev/null +++ b/klaudbiusz/cli/generation/dagger_run.py @@ -0,0 +1,277 @@ +"""Dagger-based app generation pipeline with caching and parallelism.""" + +import asyncio +import json +import logging +import os +import subprocess +import sys +from collections.abc import Callable +from pathlib import Path + +import dagger + +from cli.generation.codegen import GenerationMetrics + +logger = logging.getLogger(__name__) + + +def _read_metrics_from_app(app_dir: Path) -> GenerationMetrics | None: + """Read metrics from generation_metrics.json in app directory.""" + metrics_file = app_dir / "generation_metrics.json" + if not metrics_file.exists(): + return None + + try: + data = json.loads(metrics_file.read_text()) + return GenerationMetrics( + cost_usd=data.get("cost_usd", 0.0), + input_tokens=data.get("input_tokens", 0), + output_tokens=data.get("output_tokens", 0), + turns=data.get("turns", 0), + ) + except (json.JSONDecodeError, KeyError) as e: + logger.warning(f"Failed to parse generation metrics: {e}") + return None + + +def _check_binary_format(binary_path: Path) -> None: + """Check if binary is Linux-compatible for container execution. 
+ + Raises: + RuntimeError: If binary is not Linux ELF format + """ + try: + result = subprocess.run( + ["file", str(binary_path)], + capture_output=True, + text=True, + check=True, + ) + output = result.stdout.lower() + + if "mach-o" in output or "darwin" in output: + raise RuntimeError( + f"Binary {binary_path} is macOS format (Mach-O), but Dagger runs Linux containers.\n" + f"Please provide a Linux build. For Go: GOOS=linux GOARCH=arm64 go build ...\n" + f"Output from 'file': {result.stdout.strip()}" + ) + + if "elf" not in output: + logger.warning( + f"Binary {binary_path} may not be Linux-compatible: {result.stdout.strip()}" + ) + except FileNotFoundError: + # 'file' command not available, skip check + pass + + +class DaggerAppGenerator: + """Runs app generation in Dagger container with caching.""" + + def __init__( + self, + mcp_binary: Path, + output_dir: Path, + stream_logs: bool = True, + ): + _check_binary_format(mcp_binary) + self.mcp_binary = mcp_binary + self.output_dir = output_dir + self.stream_logs = stream_logs + + async def generate_single( + self, + prompt: str, + app_name: str, + backend: str = "claude", + model: str | None = None, + mcp_args: list[str] | None = None, + ) -> tuple[Path | None, Path, GenerationMetrics | None]: + """Generate single app, export app dir + logs. + + Returns: + tuple of (app_dir or None, log_file, metrics or None) paths on host. + app_dir is None if agent didn't create an app. + """ + if self.stream_logs: + cfg = dagger.Config(log_output=sys.stderr) + else: + cfg = dagger.Config(log_output=open(os.devnull, "w")) + async with dagger.Connection(cfg) as client: + container = await self._build_container(client) + return await self._run_generation( + client, container, prompt, app_name, backend, model, mcp_args + ) + + async def _run_generation( + self, + client: dagger.Client, + base_container: dagger.Container, + prompt: str, + app_name: str, + backend: str, + model: str | None, + mcp_args: list[str] | None, + ) -> tuple[Path | None, Path, GenerationMetrics | None]: + """Run generation in container and export results.""" + # path inside container for generated app + app_output = f"/workspace/{app_name}" + + # build command using container_runner.py (already in image via Dockerfile COPY) + cmd = [ + "python", + "cli/generation/container_runner.py", + prompt, + f"--app_name={app_name}", + f"--backend={backend}", + ] + if model: + cmd.append(f"--model={model}") + if mcp_args: + cmd.append(f"--mcp_args={json.dumps(mcp_args)}") + + # ensure log directory exists + container = base_container.with_exec(["mkdir", "-p", "/workspace/logs"]) + + # run generation + result = container.with_exec(cmd) + + # prepare log file path + log_file_local = self.output_dir / "logs" / f"{app_name}.log" + log_file_local.parent.mkdir(parents=True, exist_ok=True) + + # capture stdout/stderr - even on failure we want to save what we can + try: + log_content = await result.stdout() + stderr_content = await result.stderr() + full_log = f"{log_content}\n\n=== STDERR ===\n{stderr_content}" if stderr_content else log_content + log_file_local.write_text(full_log) + except dagger.ExecError as e: + # container command failed - save error output as log + full_log = f"=== EXEC ERROR ===\n{e}\n\n=== STDOUT ===\n{e.stdout}\n\n=== STDERR ===\n{e.stderr}" + log_file_local.write_text(full_log) + raise + + # export app directory (if it exists) + app_dir_local = self.output_dir / app_name + try: + await result.directory(app_output).export(str(app_dir_local)) + except dagger.QueryError as e: + 
if "no such file or directory" in str(e): + # agent didn't create an app directory (e.g. just answered a question) + return None, log_file_local, None + raise + + # read metrics from generation_metrics.json + metrics = _read_metrics_from_app(app_dir_local) + return app_dir_local, log_file_local, metrics + + async def generate_bulk( + self, + prompts: dict[str, str], + backend: str = "claude", + model: str | None = None, + mcp_args: list[str] | None = None, + max_concurrency: int = 4, + on_complete: Callable[[str, bool], None] | None = None, + ) -> list[tuple[str, Path | None, Path | None, GenerationMetrics | None, str | None]]: + """Generate multiple apps with Dagger parallelism. + + Uses a single Dagger connection for all generations, allowing Dagger + to optimize container reuse and parallel execution. + + Args: + prompts: dict mapping app_name to prompt + backend: "claude" or "litellm" + model: model name (required for litellm) + mcp_args: optional MCP server args + max_concurrency: max parallel generations + on_complete: callback(app_name, success) called when each app finishes + + Returns: + list of (app_name, app_dir, log_file, metrics, error) tuples + """ + # suppress dagger output for bulk runs + cfg = dagger.Config(log_output=open(os.devnull, "w")) + + async with dagger.Connection(cfg) as client: + # build container once, reuse for all generations + base_container = await self._build_container(client) + sem = asyncio.Semaphore(max_concurrency) + + async def run_with_sem( + app_name: str, prompt: str + ) -> tuple[str, Path | None, Path | None, GenerationMetrics | None, str | None]: + async with sem: + try: + app_dir, log_file, metrics = await self._run_generation( + client, base_container, prompt, app_name, backend, model, mcp_args + ) + if on_complete: + on_complete(app_name, True) + return (app_name, app_dir, log_file, metrics, None) + except Exception as e: + if on_complete: + on_complete(app_name, False) + log_path = self.output_dir / "logs" / f"{app_name}.log" + return (app_name, None, log_path if log_path.exists() else None, None, str(e)) + + tasks = [run_with_sem(name, prompt) for name, prompt in prompts.items()] + return await asyncio.gather(*tasks) + + async def _build_container(self, client: dagger.Client) -> dagger.Container: + """Build container from Dockerfile with layer caching.""" + # build context excluding generated files + context = client.host().directory( + ".", + exclude=[ + "app/", + "app-eval/", + "results/", + ".venv/", + "__pycache__/", + ".git/", + ], + ) + + # build from Dockerfile (leverages BuildKit cache) + container = context.docker_build() + + # mount mcp binary from host (not baked into image) + container = container.with_file( + "/usr/local/bin/edda_mcp", + client.host().file(str(self.mcp_binary)), + permissions=0o755, # make executable + ) + + # pass through env vars from host + env_vars = [ + "ANTHROPIC_API_KEY", + "NEON_DATABASE_URL", + ] + for var in env_vars: + if val := os.environ.get(var): + container = container.with_env_variable(var, val) + + # mount databricks config for CLI authentication (OAuth profile) + # container runs as 'klaudbiusz' user (see Dockerfile) + databrickscfg = Path.home() / ".databrickscfg" + if databrickscfg.exists(): + container = container.with_file( + "/home/klaudbiusz/.databrickscfg", + client.host().file(str(databrickscfg)), + owner="klaudbiusz:klaudbiusz", + ) + + # mount databricks directory for OAuth token cache and other CLI state + # required when using auth_type = databricks-cli + databricks_dir = 
Path.home() / ".databricks" + if databricks_dir.exists(): + container = container.with_directory( + "/home/klaudbiusz/.databricks", + client.host().directory(str(databricks_dir)), + owner="klaudbiusz:klaudbiusz", + ) + + return container diff --git a/klaudbiusz/cli/generation/single_run.py b/klaudbiusz/cli/generation/single_run.py index fc618cf5..a65f370f 100644 --- a/klaudbiusz/cli/generation/single_run.py +++ b/klaudbiusz/cli/generation/single_run.py @@ -1,98 +1,83 @@ -import fire +"""Single app generation via Dagger.""" + +import asyncio +import os from datetime import datetime +from pathlib import Path + +import fire from dotenv import load_dotenv -from cli.generation.codegen import ClaudeAppBuilder, GenerationMetrics as ClaudeGenerationMetrics -from cli.generation.codegen_multi import LiteLLMAppBuilder +from cli.generation.dagger_run import DaggerAppGenerator -# Load environment variables from .env file load_dotenv() +def _restore_terminal_cursor() -> None: + """Restore terminal cursor after Dagger run (workaround for dagger/dagger#7160).""" + os.system("tput cnorm 2>/dev/null || true") + + def run( prompt: str, app_name: str | None = None, backend: str = "claude", model: str | None = None, - wipe_db: bool = True, mcp_binary: str | None = None, - mcp_json: str | None = None, mcp_args: list[str] | None = None, -): - """Run app builder with given prompt. + output_dir: str | None = None, +) -> dict[str, str | None]: + """Run app generation in Dagger container. Args: prompt: The prompt describing what to build app_name: Optional app name (default: timestamp-based) backend: Backend to use ("claude" or "litellm", default: "claude") - model: LLM model (required if backend=litellm, e.g., "openrouter/minimax/minimax-m2") - wipe_db: Whether to wipe database on start - mcp_binary: Optional path to pre-built edda-mcp binary (default: use cargo run) - mcp_json: Optional path to JSON config file for edda_mcp - mcp_args: Optional list of args passed to the MCP server (overrides defaults) + model: LLM model (required if backend=litellm) + mcp_binary: Path to edda_mcp binary (required) + mcp_args: Optional list of args passed to the MCP server Usage: # Claude backend (default) - python main.py "build dashboard" --app_name=my-dashboard + python single_run.py "build dashboard" --mcp_binary=/path/to/edda_mcp # LiteLLM backend - python main.py "build dashboard" --backend=litellm --model=openrouter/minimax/minimax-m2 - python main.py "build dashboard" --backend=litellm --model=gemini/gemini-2.5-pro - - # Custom MCP config - python main.py "build dashboard" --mcp_json=./config/databricks-cli.json + python single_run.py "build dashboard" --backend=litellm --model=gemini/gemini-2.5-pro --mcp_binary=/path/to/edda_mcp # Custom MCP args - python main.py "build dashboard" --mcp_args='["experimental", "apps-mcp"]' + python single_run.py "build dashboard" --mcp_binary=/path/to/edda_mcp --mcp_args='["experimental", "apps-mcp"]' """ + if not mcp_binary: + raise ValueError("--mcp_binary is required") + + if backend == "litellm" and not model: + raise ValueError("--model is required when using --backend=litellm") + if app_name is None: app_name = f"app-{datetime.now().strftime('%Y%m%d-%H%M%S')}" - # single run always shows logs - suppress_logs = False - - match backend: - case "claude": - builder = ClaudeAppBuilder( - app_name=app_name, - wipe_db=wipe_db, - suppress_logs=suppress_logs, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - ) - metrics = builder.run(prompt, wipe_db=wipe_db) - case 
"litellm": - if not model: - raise ValueError("--model is required when using --backend=litellm") - builder_litellm = LiteLLMAppBuilder( - app_name=app_name, - model=model, - mcp_binary=mcp_binary, - mcp_json_path=mcp_json, - mcp_args=mcp_args, - suppress_logs=suppress_logs, - ) - litellm_metrics = builder_litellm.run(prompt) - # convert to dict format for consistent output - metrics: ClaudeGenerationMetrics = { - "cost_usd": litellm_metrics.cost_usd, - "input_tokens": litellm_metrics.input_tokens, - "output_tokens": litellm_metrics.output_tokens, - "turns": litellm_metrics.turns, - "app_dir": litellm_metrics.app_dir, - } - case _: - raise ValueError(f"Unknown backend: {backend}. Use 'claude' or 'litellm'") - - if metrics: - print(f"\n{'=' * 80}") - print("Final metrics:") - print(f" Cost: ${metrics['cost_usd']:.4f}") - print(f" Turns: {metrics['turns']}") - print(f" App dir: {metrics.get('app_dir', 'NOT CAPTURED')}") - print(f"{'=' * 80}\n") - return metrics + generator = DaggerAppGenerator( + mcp_binary=Path(mcp_binary), + output_dir=Path(output_dir) if output_dir else Path("./app"), + ) + + try: + app_dir, log_file = asyncio.run( + generator.generate_single(prompt, app_name, backend, model, mcp_args) + ) + finally: + _restore_terminal_cursor() + + print(f"\n{'=' * 80}") + if app_dir: + print("Generation complete:") + print(f" App: {app_dir}") + else: + print("No app generated (agent may have just answered without creating files)") + print(f" Log: {log_file}") + print(f"{'=' * 80}\n") + + return {"app_dir": str(app_dir) if app_dir else None, "log_file": str(log_file)} def main(): diff --git a/klaudbiusz/cli/utils/shared.py b/klaudbiusz/cli/utils/shared.py index b4e3d69e..3520611c 100644 --- a/klaudbiusz/cli/utils/shared.py +++ b/klaudbiusz/cli/utils/shared.py @@ -76,14 +76,16 @@ def log_text(self, role: str, text: str, emoji: str = "💬") -> None: if not self.suppress_logs: logger.info(f"{emoji} {text}") - self.trajectory_messages.append(Message( - role=role, - content=text, - tool_calls=None, - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role=role, + content=text, + tool_calls=None, + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_tool_call(self, tool_name: str, arguments: dict[str, Any], tool_id: str) -> None: """Log tool call from assistant. @@ -100,14 +102,16 @@ def log_tool_call(self, tool_name: str, arguments: dict[str, Any], tool_id: str) logger.info(f"🔧 Tool: {tool_name}({truncated})") # trajectory collection - self.trajectory_messages.append(Message( - role="assistant", - content=None, - tool_calls=[ToolCall(id=tool_id, name=tool_name, arguments=arguments)], - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="assistant", + content=None, + tool_calls=[ToolCall(id=tool_id, name=tool_name, arguments=arguments)], + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_tool_result(self, tool_id: str, result: str, is_error: bool = False) -> None: """Log tool result from environment. 
@@ -126,14 +130,16 @@ def log_tool_result(self, tool_id: str, result: str, is_error: bool = False) -> logger.info(f"✅ Tool result: {truncated}") # trajectory collection - self.trajectory_messages.append(Message( - role="tool", - content=None, - tool_calls=None, - tool_results=[ToolResult(tool_call_id=tool_id, content=result, is_error=is_error)], - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="tool", + content=None, + tool_calls=None, + tool_results=[ToolResult(tool_call_id=tool_id, content=result, is_error=is_error)], + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_subagent_invoke(self, subagent_type: str, description: str, prompt: str) -> None: """Log subagent delegation (Claude SDK specific).""" @@ -144,14 +150,16 @@ def log_subagent_invoke(self, subagent_type: str, description: str, prompt: str) logger.info(f" Instructions: {truncated}") # add to trajectory as assistant message (contextual info) - self.trajectory_messages.append(Message( - role="assistant", - content=f"[Delegating to subagent: {subagent_type}] {description}", - tool_calls=None, - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="assistant", + content=f"[Delegating to subagent: {subagent_type}] {description}", + tool_calls=None, + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_todo_update(self, todos: list[dict[str, Any]]) -> None: """Log todo list update.""" @@ -165,14 +173,16 @@ def log_todo_update(self, todos: list[dict[str, Any]]) -> None: # add to trajectory as assistant message (contextual info) summary = f"Todo update: {sum(1 for t in todos if t.get('status') == 'completed')}/{len(todos)} completed" - self.trajectory_messages.append(Message( - role="assistant", - content=f"[{summary}]", - tool_calls=None, - tool_results=None, - timestamp=datetime.now(timezone.utc), - tokens=None, - )) + self.trajectory_messages.append( + Message( + role="assistant", + content=f"[{summary}]", + tool_calls=None, + tool_results=None, + timestamp=datetime.now(timezone.utc), + tokens=None, + ) + ) def log_session_complete( self, @@ -219,6 +229,9 @@ async def save( if not self.trajectory_messages: return + if not self.app_name: + logger.warning("⚠️ App name not set, skipping trajectory save") + trajectory = Trajectory( run_id=str(self.run_id), app_name=self.app_name, @@ -261,6 +274,36 @@ def resolve(self, tool_id: str) -> None: if tool_id in self._pending: self.app_dir = self._pending.pop(tool_id) + def detect_from_filesystem(self, search_root: Path | None = None) -> str | None: + """Fallback: detect scaffold by finding package.json at top level. + + Globs for package.json and chooses the one closest to search_root. + This works in dagger environments where /workspace is the base. 
+
+        Args:
+            search_root: Root directory to search from (e.g., /workspace or output_dir)
+
+        Returns:
+            Path to detected app directory, or None if not found
+        """
+        if search_root is None:
+            search_root = Path("/workspace")
+
+        if not search_root.exists():
+            logger.warning(f"⚠️ Search root does not exist: {search_root}")
+            return None
+
+        logger.info(f"🔍 Searching for scaffolded app directory under {search_root}")
+        # glob for app marker files (databricks.yml)
+        marker_files = list(search_root.glob("**/databricks.yml"))
+        if not marker_files:
+            logger.warning("⚠️ Could not detect scaffolded app directory from filesystem")
+            return None
+        file, *_ = marker_files
+        app_dir = file.parent
+        logger.info(f"✅ Detected scaffolded app directory: {app_dir}")
+        return str(app_dir)
+

 def validate_mcp_manifest(mcp_binary: str | None, project_root: Path) -> Path | None:
     """Validate MCP manifest exists if using cargo run.
@@ -293,6 +336,7 @@ def setup_logging(suppress_logs: bool, mcp_binary: str | None = None) -> None:
     else:
         try:
             import coloredlogs  # type: ignore[import-untyped]
+
             coloredlogs.install(level="INFO")
         except ImportError:
             logging.basicConfig(level=logging.INFO)