Skip to content

Commit 1972180

Browse files
authored
[AVC] Remove incorrect row processing causing false positives (#12661)
* Remove incorrect row processing causing false positives
* Added regression test
* Fixed incorrect result separation, now separated into successful/failed/errored
* Removed unit test for incorrect row processing
1 parent e2ebcc0 commit 1972180

File tree

2 files changed

+44
-41
lines changed

2 files changed

+44
-41
lines changed

packages/python-packages/apiview-copilot/evals/_runner.py

Lines changed: 43 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,6 @@
2020
from evals._util import (
2121
load_cache_lookup,
2222
append_results_to_cache,
23-
get_cache_file_path,
24-
construct_fake_azure_result
2523
)
2624

2725
DEFAULT_NUM_RUNS: int = 1
@@ -260,12 +258,18 @@ def _execute_target(self, target: EvaluationTarget) -> EvaluationResult:
260258
cached_rows = [row for row in cached_azure_rows]
261259
fresh_rows = [row for result in fresh_results for row in result.get("rows", [])]
262260
all_cached_rows = cached_rows + fresh_rows
263-
combined_result = construct_fake_azure_result(all_cached_rows)
261+
combined_result = {
262+
"rows": all_cached_rows,
263+
"metrics": {},
264+
"studio_url": None
265+
}
266+
267+
all_passed = all(row.get("outputs.metrics.correct_action", False) for row in all_cached_rows)
264268

265269
return EvaluationResult(
266270
target=target,
267271
raw_results=[{f"{target.workflow_name}.jsonl": combined_result}],
268-
success=True,
272+
success=all_passed,
269273
)
270274

271275
except Exception as e:
@@ -320,63 +324,78 @@ def show_results(self, results: list[EvaluationResult]):
320324
print()
321325

322326
successful = [r for r in results if r.success and r.raw_results]
323-
failed = [r for r in results if not r.success]
327+
failed = [r for r in results if not r.success and r.raw_results and r.error is None]
328+
errored = [r for r in results if r.error is not None]
324329

325330
if successful:
326331
for result in successful:
327-
print(f"✅ {result.workflow_name}")
332+
print(f" {result.workflow_name}")
328333
raw_results = result.raw_results[0]
329334
for filename, eval_result in raw_results.items():
330-
print(f" == {filename} ==")
335+
print(f" == {filename} ==")
331336
for res in eval_result["rows"]:
332337
success = res["outputs.metrics.correct_action"]
333338
testcase_name = res["inputs.testcase"]
334339
score = res["outputs.metrics.score"]
335-
print(f" - {'✅' if success else '❌'} {score} - {testcase_name}")
336-
print()
340+
print(f" - {'✅' if success else '❌'} {score} - {testcase_name}")
341+
print()
337342

338343
if failed:
339-
print("❌ FAILED EVALUATIONS:")
340344
for result in failed:
341-
print(f" • {result.workflow_name}: {result.error}")
345+
print(f" ❌ {result.workflow_name}")
346+
raw_results = result.raw_results[0]
347+
for filename, eval_result in raw_results.items():
348+
print(f" == {filename} ==")
349+
for res in eval_result["rows"]:
350+
success = res["outputs.metrics.correct_action"]
351+
testcase_name = res["inputs.testcase"]
352+
score = res["outputs.metrics.score"]
353+
print(f" - {'✅' if success else '❌'} {score} - {testcase_name}")
342354
print()
343355

344-
if not successful and not failed:
356+
if errored:
357+
print("💥 ERRORED EVALUATIONS:")
358+
for result in errored:
359+
print(f" 💥 {result.workflow_name}: {result.error}")
360+
print()
361+
362+
if not successful and not failed and not errored:
345363
print("No evaluation results to display.")
346364
print()
347365

348366
def show_summary(self, results: list[EvaluationResult]):
349367
"""Display aggregated results from all evaluations."""
350-
successful = [r for r in results if r.success]
351-
failed = [r for r in results if not r.success]
368+
successful = [r for r in results if r.success and r.raw_results]
369+
failed = [r for r in results if not r.success and r.raw_results and r.error is None]
370+
errored = [r for r in results if r.error is not None]
352371

353372
print("=" * 60)
354373
print("📈 EVALUATION SUMMARY")
355374
print("=" * 60)
356375
print(f"Total targets: {len(results)}")
357376
print(f"✅ Successful: {len(successful)}")
358377
print(f"❌ Failed: {len(failed)}")
378+
print(f"💥 Errored: {len(errored)}")
359379
print()
360380

381+
if errored:
382+
print("💥 ERRORED EVALUATIONS:")
383+
for result in errored:
384+
print(f" • {result.workflow_name}: {result.error}")
385+
print()
386+
361387
if failed:
362388
print("❌ FAILED EVALUATIONS:")
363389
for result in failed:
364-
print(f" • {result.workflow_name}: {result.error}")
390+
print(f" • {result.workflow_name}")
365391
print()
366392

367393
if successful:
368394
print("✅ SUCCESSFUL EVALUATIONS:")
369-
total_test_files = sum(r.num_test_files for r in successful)
370-
print(f" • Processed {total_test_files} test files across {len(successful)} workflows")
371-
372-
# Group by workflow type
373-
by_type = {}
374395
for result in successful:
375-
workflow_type = result.target.config.kind
376-
by_type.setdefault(workflow_type, []).append(result)
396+
print(f" • {result.workflow_name}")
397+
print()
377398

378-
for workflow_type, type_results in by_type.items():
379-
print(f" • {workflow_type}: {len(type_results)} workflows")
380399

381400
def cleanup(self):
382401
"""Clean up resources."""

packages/python-packages/apiview-copilot/evals/_util.py

Lines changed: 1 addition & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -141,20 +141,4 @@ def save_result_to_cache(testcase_id: str, test_file_path: Path, azure_result_ro
141141
except IOError:
142142
# Continue without caching if write fails
143143
pass
144-
145-
def construct_fake_azure_result(cached_rows: list[dict]) -> dict:
146-
"""Construct a fake Azure AI evaluation result from cached rows."""
147-
# Extract the actual Azure AI result rows from cache
148-
result_rows = []
149-
for cached_row in cached_rows:
150-
if "row" in cached_row:
151-
result_rows.append(cached_row["row"])
152-
153-
# Create a minimal Azure AI evaluation result structure
154-
fake_result = {
155-
"rows": result_rows,
156-
"metrics": {}, # Azure AI framework will populate this
157-
"studio_url": None # Not needed for cached results
158-
}
159-
160-
return fake_result
144+

0 commit comments

Comments
 (0)