20 | 20 | from evals._util import ( |
21 | 21 | load_cache_lookup, |
22 | 22 | append_results_to_cache, |
23 | | - get_cache_file_path, |
24 | | - construct_fake_azure_result |
25 | 23 | ) |
26 | 24 |
27 | 25 | DEFAULT_NUM_RUNS: int = 1 |
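
Note: get_cache_file_path and construct_fake_azure_result are dropped from the import above, and the hunk below builds the fake Azure-style result inline instead of calling the helper. As a rough sketch only (inferred from that inline replacement; the real definition lives in evals/_util.py and may differ), the removed helper amounted to something like:

    # Hypothetical reconstruction of the removed construct_fake_azure_result
    # helper, inferred from the inline dict that replaces it in _execute_target.
    def construct_fake_azure_result(rows: list[dict]) -> dict:
        return {"rows": rows, "metrics": {}, "studio_url": None}
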
@@ -260,12 +258,18 @@ def _execute_target(self, target: EvaluationTarget) -> EvaluationResult: |
260 | 258 | cached_rows = [row for row in cached_azure_rows] |
261 | 259 | fresh_rows = [row for result in fresh_results for row in result.get("rows", [])] |
262 | 260 | all_cached_rows = cached_rows + fresh_rows |
263 | | - combined_result = construct_fake_azure_result(all_cached_rows) |
| 261 | + combined_result = { |
| 262 | + "rows": all_cached_rows, |
| 263 | + "metrics": {}, |
| 264 | + "studio_url": None |
| 265 | + } |
| 266 | + |
| 267 | + all_passed = all(row.get("outputs.metrics.correct_action", False) for row in all_cached_rows) |
264 | 268 |
265 | 269 | return EvaluationResult( |
266 | 270 | target=target, |
267 | 271 | raw_results=[{f"{target.workflow_name}.jsonl": combined_result}], |
268 | | - success=True, |
| 272 | + success=all_passed, |
269 | 273 | ) |
270 | 274 |
271 | 275 | except Exception as e: |
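
The inline combined_result keeps the same Azure-style shape (rows, metrics, studio_url) that the removed helper produced, and success is now derived from the rows instead of being hard-coded to True. A minimal standalone sketch of that aggregation, assuming each row carries the boolean "outputs.metrics.correct_action" key that show_results also reads:

    # Sketch of the new pass/fail aggregation over cached + fresh rows
    # (the row shape here is an assumption based on the keys used in this file).
    rows = [
        {"outputs.metrics.correct_action": True},
        {"outputs.metrics.correct_action": False},
    ]
    all_passed = all(row.get("outputs.metrics.correct_action", False) for row in rows)
    print(all_passed)  # False: a single incorrect action fails the whole target

One edge case: all() over an empty iterable is True, so a target that yields no cached or fresh rows would still be reported as successful.
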
@@ -320,63 +324,78 @@ def show_results(self, results: list[EvaluationResult]): |
320 | 324 | print() |
321 | 325 |
322 | 326 | successful = [r for r in results if r.success and r.raw_results] |
323 | | - failed = [r for r in results if not r.success] |
| 327 | + failed = [r for r in results if not r.success and r.raw_results and r.error is None] |
| 328 | + errored = [r for r in results if r.error is not None] |
324 | 329 |
325 | 330 | if successful: |
326 | 331 | for result in successful: |
327 | | - print(f"✅ {result.workflow_name}") |
| 332 | + print(f" ✅ {result.workflow_name}") |
328 | 333 | raw_results = result.raw_results[0] |
329 | 334 | for filename, eval_result in raw_results.items(): |
330 | | - print(f" == {filename} ==") |
| 335 | + print(f" == {filename} ==") |
331 | 336 | for res in eval_result["rows"]: |
332 | 337 | success = res["outputs.metrics.correct_action"] |
333 | 338 | testcase_name = res["inputs.testcase"] |
334 | 339 | score = res["outputs.metrics.score"] |
335 | | - print(f" - {'✅' if success else '❌'} {score} - {testcase_name}") |
336 | | - print() |
| 340 | + print(f" - {'✅' if success else '❌'} {score} - {testcase_name}") |
| 341 | + print() |
337 | 342 |
338 | 343 | if failed: |
339 | | - print("❌ FAILED EVALUATIONS:") |
340 | 344 | for result in failed: |
341 | | - print(f" • {result.workflow_name}: {result.error}") |
| 345 | + print(f" ❌ {result.workflow_name}") |
| 346 | + raw_results = result.raw_results[0] |
| 347 | + for filename, eval_result in raw_results.items(): |
| 348 | + print(f" == {filename} ==") |
| 349 | + for res in eval_result["rows"]: |
| 350 | + success = res["outputs.metrics.correct_action"] |
| 351 | + testcase_name = res["inputs.testcase"] |
| 352 | + score = res["outputs.metrics.score"] |
| 353 | + print(f" - {'✅' if success else '❌'} {score} - {testcase_name}") |
342 | 354 | print() |
343 | 355 |
344 | | - if not successful and not failed: |
| 356 | + if errored: |
| 357 | + print("💥 ERRORED EVALUATIONS:") |
| 358 | + for result in errored: |
| 359 | + print(f" 💥 {result.workflow_name}: {result.error}") |
| 360 | + print() |
| 361 | + |
| 362 | + if not successful and not failed and not errored: |
345 | 363 | print("No evaluation results to display.") |
346 | 364 | print() |
347 | 365 |
348 | 366 | def show_summary(self, results: list[EvaluationResult]): |
349 | 367 | """Display aggregated results from all evaluations.""" |
350 | | - successful = [r for r in results if r.success] |
351 | | - failed = [r for r in results if not r.success] |
| 368 | + successful = [r for r in results if r.success and r.raw_results] |
| 369 | + failed = [r for r in results if not r.success and r.raw_results and r.error is None] |
| 370 | + errored = [r for r in results if r.error is not None] |
352 | 371 |
353 | 372 | print("=" * 60) |
354 | 373 | print("📈 EVALUATION SUMMARY") |
355 | 374 | print("=" * 60) |
356 | 375 | print(f"Total targets: {len(results)}") |
357 | 376 | print(f"✅ Successful: {len(successful)}") |
358 | 377 | print(f"❌ Failed: {len(failed)}") |
| 378 | + print(f"💥 Errored: {len(errored)}") |
359 | 379 | print() |
360 | 380 |
| 381 | + if errored: |
| 382 | + print("💥 ERRORED EVALUATIONS:") |
| 383 | + for result in errored: |
| 384 | + print(f" • {result.workflow_name}: {result.error}") |
| 385 | + print() |
| 386 | + |
361 | 387 | if failed: |
362 | 388 | print("❌ FAILED EVALUATIONS:") |
363 | 389 | for result in failed: |
364 | | - print(f" • {result.workflow_name}: {result.error}") |
| 390 | + print(f" • {result.workflow_name}") |
365 | 391 | print() |
366 | 392 |
367 | 393 | if successful: |
368 | 394 | print("✅ SUCCESSFUL EVALUATIONS:") |
369 | | - total_test_files = sum(r.num_test_files for r in successful) |
370 | | - print(f" • Processed {total_test_files} test files across {len(successful)} workflows") |
371 | | - |
372 | | - # Group by workflow type |
373 | | - by_type = {} |
374 | 395 | for result in successful: |
375 | | - workflow_type = result.target.config.kind |
376 | | - by_type.setdefault(workflow_type, []).append(result) |
| 396 | + print(f" • {result.workflow_name}") |
| 397 | + print() |
377 | 398 |
378 | | - for workflow_type, type_results in by_type.items(): |
379 | | - print(f" • {workflow_type}: {len(type_results)} workflows") |
380 | 399 |
381 | 400 | def cleanup(self): |
382 | 401 | """Clean up resources.""" |