From a9e2db72f7fb657ce12ebdeeff1054d8a37af3a7 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Mon, 23 Feb 2026 14:32:11 +0530 Subject: [PATCH 1/3] Added ban list evaluation --- backend/README.md | 16 ++++- backend/app/evaluation/ban_list/run.py | 82 ++++++++++++++++++++++++++ backend/scripts/run_all_evaluations.sh | 14 +++++ 3 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 backend/app/evaluation/ban_list/run.py diff --git a/backend/README.md b/backend/README.md index e90b491..c8a0c80 100644 --- a/backend/README.md +++ b/backend/README.md @@ -103,10 +103,11 @@ Important: each `run.py` expects a specific filename, so dataset files must be n - `app/evaluation/lexical_slur/run.py` expects `lexical_slur_testing_dataset.csv` - `app/evaluation/pii/run.py` expects `pii_detection_testing_dataset.csv` - `app/evaluation/gender_assumption_bias/run.py` expects `gender_bias_assumption_dataset.csv` +- `app/evaluation/ban_list/run.py` expects `ban_list_testing_dataset.csv` Once these files are in place with the exact names above, run the evaluation scripts. -Unit tests for lexical slur match, ban list, and gender assumption bias validators have limited value because their logic is deterministic. However, curated datasets exist for lexical slur match and gender assumption bias to benchmark accuracy and latency. The lexical slur dataset will also be used in future toxicity detection workflows. +Unit tests for lexical slur match, ban list, and gender assumption bias validators have limited value because their logic is deterministic. Curated datasets are used to benchmark accuracy and latency for lexical slur, gender assumption bias, and ban list. The lexical slur dataset will also be used in future toxicity detection workflows. 
Each validator produces: - predictions.csv – row-level outputs for debugging and analysis @@ -121,6 +122,9 @@ app/evaluation/outputs/ gender_assumption_bias/ predictions.csv metrics.json + ban_list/ + predictions.csv + metrics.json pii_remover/ predictions.csv metrics.json @@ -128,12 +132,20 @@ app/evaluation/outputs/ - To run all evaluation scripts together, use: ```bash -bash scripts/run_all_evaluations.sh +BAN_LIST_WORDS="word1,word2" bash scripts/run_all_evaluations.sh +``` +or +```bash +bash scripts/run_all_evaluations.sh BAN_LIST_WORDS="word1,word2" ``` + +`BAN_LIST_WORDS` is required for the `ban_list` evaluator and should be a comma-separated list. + This script runs the evaluators in sequence: - `app/evaluation/lexical_slur/run.py` - `app/evaluation/pii/run.py` - `app/evaluation/gender_assumption_bias/run.py` +- `app/evaluation/ban_list/run.py` To evaluate any specific evaluator, run the offline evaluation script: `python ` diff --git a/backend/app/evaluation/ban_list/run.py b/backend/app/evaluation/ban_list/run.py new file mode 100644 index 0000000..d221b22 --- /dev/null +++ b/backend/app/evaluation/ban_list/run.py @@ -0,0 +1,82 @@ +import os +from pathlib import Path + +import pandas as pd +from guardrails.hub import BanList +from guardrails.validators import FailResult + +from app.evaluation.common.helper import ( + Profiler, + compute_binary_metrics, + write_csv, + write_json, +) + +BASE_DIR = Path(__file__).resolve().parent.parent +OUT_DIR = BASE_DIR / "outputs" / "ban_list" +DATASET_PATH = BASE_DIR / "datasets" / "ban_list_testing_dataset.csv" + +# Provide comma-separated words via env var BAN_LIST_WORDS, e.g.: +# BAN_LIST_WORDS="badword,slur,profanity" +DEFAULT_BANNED_WORDS = ["badword"] +BANNED_WORDS = [ + word.strip() + for word in os.getenv("BAN_LIST_WORDS", ",".join(DEFAULT_BANNED_WORDS)).split(",") + if word.strip() +] + +df = pd.read_csv(DATASET_PATH) + +validator = BanList( + banned_words=BANNED_WORDS, +) + + +def run_ban_list(text: str): + 
result = validator.validate(text, metadata=None) + if isinstance(result, FailResult): + return result.fix_value, 1 + return text, 0 + + +with Profiler() as p: + outputs = df["source_text"].astype(str).apply(lambda x: p.record(run_ban_list, x)) + +df["redacted_text"] = outputs.apply(lambda x: x[0]) +df["y_pred"] = outputs.apply(lambda x: x[1]) + +if "label" in df.columns: + df["y_true"] = df["label"].astype(int) +else: + df["y_true"] = ( + df["source_text"].astype(str) != df["target_text"].astype(str) + ).astype(int) + +metrics = compute_binary_metrics(df["y_true"], df["y_pred"]) + +if "target_text" in df.columns: + exact_match = ( + df["redacted_text"].astype(str) == df["target_text"].astype(str) + ).mean() + metrics["exact_match"] = round(float(exact_match), 2) + +write_csv(df, OUT_DIR / "predictions.csv") + +write_json( + { + "guardrail": "ban_list", + "num_samples": len(df), + "banned_words": BANNED_WORDS, + "dataset": str(DATASET_PATH.name), + "metrics": metrics, + "performance": { + "latency_ms": { + "mean": round(sum(p.latencies) / len(p.latencies), 2), + "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), + "max": round(max(p.latencies), 2), + }, + "memory_mb": round(p.peak_memory_mb, 2), + }, + }, + OUT_DIR / "metrics.json", +) diff --git a/backend/scripts/run_all_evaluations.sh b/backend/scripts/run_all_evaluations.sh index 65de917..157c5f2 100755 --- a/backend/scripts/run_all_evaluations.sh +++ b/backend/scripts/run_all_evaluations.sh @@ -5,10 +5,19 @@ set -euo pipefail BACKEND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" EVAL_DIR="$BACKEND_DIR/app/evaluation" +# Support passing env assignments as args, e.g.: +# scripts/run_all_evaluations.sh BAN_LIST_WORDS="foo,bar" +for arg in "$@"; do + if [[ "$arg" == *=* ]]; then + export "$arg" + fi +done + RUNNERS=( "$EVAL_DIR/lexical_slur/run.py" "$EVAL_DIR/pii/run.py" "$EVAL_DIR/gender_assumption_bias/run.py" + "$EVAL_DIR/ban_list/run.py" ) echo "Running validator evaluations..." 
@@ -18,6 +27,11 @@ for runner in "${RUNNERS[@]}"; do name="$(basename "$(dirname "$runner")")" echo "" echo "==> [$name] $runner" + + if [[ "$name" == "ban_list" ]]; then + : "${BAN_LIST_WORDS:?BAN_LIST_WORDS must be set for ban_list evaluation (comma-separated)}" + fi + uv run python "$runner" done From 946c70559ee1ac768e2454f1a3e5e8917c0a9af6 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 24 Feb 2026 06:53:58 +0530 Subject: [PATCH 2/3] resolved comments --- backend/app/evaluation/ban_list/run.py | 67 +++++++++---------- backend/app/evaluation/common/helper.py | 36 ++++++++++ .../evaluation/gender_assumption_bias/run.py | 20 ++---- backend/app/evaluation/lexical_slur/run.py | 22 +++--- backend/app/evaluation/pii/run.py | 26 ++++--- backend/scripts/run_all_evaluations.sh | 5 -- 6 files changed, 95 insertions(+), 81 deletions(-) diff --git a/backend/app/evaluation/ban_list/run.py b/backend/app/evaluation/ban_list/run.py index d221b22..9312988 100644 --- a/backend/app/evaluation/ban_list/run.py +++ b/backend/app/evaluation/ban_list/run.py @@ -6,6 +6,7 @@ from guardrails.validators import FailResult from app.evaluation.common.helper import ( + build_evaluation_report, Profiler, compute_binary_metrics, write_csv, @@ -18,65 +19,61 @@ # Provide comma-separated words via env var BAN_LIST_WORDS, e.g.: # BAN_LIST_WORDS="badword,slur,profanity" -DEFAULT_BANNED_WORDS = ["badword"] -BANNED_WORDS = [ - word.strip() - for word in os.getenv("BAN_LIST_WORDS", ",".join(DEFAULT_BANNED_WORDS)).split(",") - if word.strip() -] +BAN_LIST_WORDS_RAW = os.getenv("BAN_LIST_WORDS") +if not BAN_LIST_WORDS_RAW: + raise ValueError( + "BAN_LIST_WORDS must be set for ban_list evaluation (comma-separated)." 
+ ) -df = pd.read_csv(DATASET_PATH) +BANNED_WORDS = [word.strip() for word in BAN_LIST_WORDS_RAW.split(",") if word.strip()] + +dataset = pd.read_csv(DATASET_PATH) validator = BanList( banned_words=BANNED_WORDS, ) -def run_ban_list(text: str): +def run_ban_list(text: str) -> tuple[str, int]: result = validator.validate(text, metadata=None) if isinstance(result, FailResult): - return result.fix_value, 1 + return (result.fix_value or text), 1 return text, 0 with Profiler() as p: - outputs = df["source_text"].astype(str).apply(lambda x: p.record(run_ban_list, x)) + results = ( + dataset["source_text"].astype(str).apply(lambda x: p.record(run_ban_list, x)) + ) -df["redacted_text"] = outputs.apply(lambda x: x[0]) -df["y_pred"] = outputs.apply(lambda x: x[1]) +dataset["redacted_text"] = results.apply(lambda x: x[0]) +dataset["y_pred"] = results.apply(lambda x: x[1]) -if "label" in df.columns: - df["y_true"] = df["label"].astype(int) +if "label" in dataset.columns: + dataset["y_true"] = dataset["label"].astype(int) else: - df["y_true"] = ( - df["source_text"].astype(str) != df["target_text"].astype(str) + dataset["y_true"] = ( + dataset["source_text"].astype(str) != dataset["target_text"].astype(str) ).astype(int) -metrics = compute_binary_metrics(df["y_true"], df["y_pred"]) +metrics = compute_binary_metrics(dataset["y_true"], dataset["y_pred"]) -if "target_text" in df.columns: +if "target_text" in dataset.columns: exact_match = ( - df["redacted_text"].astype(str) == df["target_text"].astype(str) + dataset["redacted_text"].astype(str) == dataset["target_text"].astype(str) ).mean() metrics["exact_match"] = round(float(exact_match), 2) -write_csv(df, OUT_DIR / "predictions.csv") +write_csv(dataset, OUT_DIR / "predictions.csv") write_json( - { - "guardrail": "ban_list", - "num_samples": len(df), - "banned_words": BANNED_WORDS, - "dataset": str(DATASET_PATH.name), - "metrics": metrics, - "performance": { - "latency_ms": { - "mean": round(sum(p.latencies) / len(p.latencies), 
2), - "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), - "max": round(max(p.latencies), 2), - }, - "memory_mb": round(p.peak_memory_mb, 2), - }, - }, + build_evaluation_report( + guardrail="ban_list", + num_samples=len(dataset), + profiler=p, + banned_words=BANNED_WORDS, + dataset=str(DATASET_PATH.name), + metrics=metrics, + ), OUT_DIR / "metrics.json", ) diff --git a/backend/app/evaluation/common/helper.py b/backend/app/evaluation/common/helper.py index 80dfa94..d2a5fad 100644 --- a/backend/app/evaluation/common/helper.py +++ b/backend/app/evaluation/common/helper.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Any import json import pandas as pd import time @@ -16,6 +17,41 @@ def write_json(obj: dict, path: Path): json.dump(obj, f, indent=2) +def summarize_latency(latencies: list[float]) -> dict[str, float]: + if not latencies: + return {"mean": 0.0, "p95": 0.0, "max": 0.0} + + sorted_latencies = sorted(latencies) + p95_idx = min(len(sorted_latencies) - 1, int(len(sorted_latencies) * 0.95)) + + return { + "mean": round(sum(latencies) / len(latencies), 2), + "p95": round(sorted_latencies[p95_idx], 2), + "max": round(max(latencies), 2), + } + + +def build_performance_payload(profiler: "Profiler") -> dict[str, Any]: + return { + "latency_ms": summarize_latency(profiler.latencies), + "memory_mb": round(profiler.peak_memory_mb, 2), + } + + +def build_evaluation_report( + guardrail: str, + num_samples: int, + profiler: "Profiler", + **extra_fields: Any, +) -> dict[str, Any]: + return { + "guardrail": guardrail, + "num_samples": num_samples, + **extra_fields, + "performance": build_performance_payload(profiler), + } + + def compute_binary_metrics(y_true, y_pred): tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True)) tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True)) diff --git a/backend/app/evaluation/gender_assumption_bias/run.py 
b/backend/app/evaluation/gender_assumption_bias/run.py index 9fee7ab..301f450 100644 --- a/backend/app/evaluation/gender_assumption_bias/run.py +++ b/backend/app/evaluation/gender_assumption_bias/run.py @@ -4,6 +4,7 @@ from app.core.validators.gender_assumption_bias import GenderAssumptionBias from app.evaluation.common.helper import ( + build_evaluation_report, compute_binary_metrics, Profiler, write_csv, @@ -52,18 +53,11 @@ ) write_json( - { - "guardrail": "gender_assumption_bias", - "num_samples": len(df) * 2, # because evaluating both sides - "metrics": metrics, - "performance": { - "latency_ms": { - "mean": round(sum(p.latencies) / len(p.latencies), 2), - "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), - "max": round(max(p.latencies), 2), - }, - "memory_mb": round(p.peak_memory_mb, 2), - }, - }, + build_evaluation_report( + guardrail="gender_assumption_bias", + num_samples=len(df) * 2, + profiler=p, + metrics=metrics, + ), OUT_DIR / "metrics.json", ) diff --git a/backend/app/evaluation/lexical_slur/run.py b/backend/app/evaluation/lexical_slur/run.py index c187aa7..9f1808d 100644 --- a/backend/app/evaluation/lexical_slur/run.py +++ b/backend/app/evaluation/lexical_slur/run.py @@ -4,8 +4,9 @@ from app.core.validators.lexical_slur import LexicalSlur from app.evaluation.common.helper import ( - compute_binary_metrics, + build_evaluation_report, Profiler, + compute_binary_metrics, write_csv, write_json, ) @@ -33,18 +34,11 @@ write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv") write_json( - { - "guardrail": "lexical_slur", - "num_samples": len(df), - "metrics": metrics, - "performance": { - "latency_ms": { - "mean": round(sum(p.latencies) / len(p.latencies), 2), - "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), - "max": round(max(p.latencies), 2), - }, - "memory_mb": round(p.peak_memory_mb, 2), - }, - }, + build_evaluation_report( + guardrail="lexical_slur", + num_samples=len(df), + profiler=p, + 
metrics=metrics, + ), OUT_DIR / "metrics.json", ) diff --git a/backend/app/evaluation/pii/run.py b/backend/app/evaluation/pii/run.py index aec2af8..ccd669c 100644 --- a/backend/app/evaluation/pii/run.py +++ b/backend/app/evaluation/pii/run.py @@ -3,8 +3,13 @@ from guardrails.validators import FailResult from app.core.validators.pii_remover import PIIRemover +from app.evaluation.common.helper import ( + Profiler, + build_evaluation_report, + write_csv, + write_json, +) from app.evaluation.pii.entity_metrics import compute_entity_metrics -from app.evaluation.common.helper import Profiler, write_csv, write_json BASE_DIR = Path(__file__).resolve().parent.parent OUT_DIR = BASE_DIR / "outputs" / "pii_remover" @@ -35,18 +40,11 @@ def run_pii(text: str) -> str: write_csv(df, OUT_DIR / "predictions.csv") write_json( - { - "guardrail": "pii_remover", - "num_samples": len(df), - "entity_metrics": entity_report, - "performance": { - "latency_ms": { - "mean": round(sum(p.latencies) / len(p.latencies), 2), - "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2), - "max": round(max(p.latencies), 2), - }, - "memory_mb": round(p.peak_memory_mb, 2), - }, - }, + build_evaluation_report( + guardrail="pii_remover", + num_samples=len(df), + profiler=p, + entity_metrics=entity_report, + ), OUT_DIR / "metrics.json", ) diff --git a/backend/scripts/run_all_evaluations.sh b/backend/scripts/run_all_evaluations.sh index 157c5f2..69e2fdf 100755 --- a/backend/scripts/run_all_evaluations.sh +++ b/backend/scripts/run_all_evaluations.sh @@ -27,11 +27,6 @@ for runner in "${RUNNERS[@]}"; do name="$(basename "$(dirname "$runner")")" echo "" echo "==> [$name] $runner" - - if [[ "$name" == "ban_list" ]]; then - : "${BAN_LIST_WORDS:?BAN_LIST_WORDS must be set for ban_list evaluation (comma-separated)}" - fi - uv run python "$runner" done From 2d3314c3a3fa96fef70af435600215d385f27fe2 Mon Sep 17 00:00:00 2001 From: rkritika1508 Date: Tue, 24 Feb 2026 06:56:40 +0530 Subject: [PATCH 3/3] 
resolved comments

---
 backend/README.md                      | 16 +++-------------
 backend/app/evaluation/ban_list/run.py |  9 ++++++---
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/backend/README.md b/backend/README.md
index c8a0c80..761e96f 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -115,19 +115,9 @@ Each validator produces:
 Standardized output structure:
 
 ```text
-app/evaluation/outputs/
-  lexical_slur/
-    predictions.csv
-    metrics.json
-  gender_assumption_bias/
-    predictions.csv
-    metrics.json
-  ban_list/
-    predictions.csv
-    metrics.json
-  pii_remover/
-    predictions.csv
-    metrics.json
+app/evaluation/outputs/<validator_name>/
+  predictions.csv
+  metrics.json
 ```
 
 - To run all evaluation scripts together, use:
diff --git a/backend/app/evaluation/ban_list/run.py b/backend/app/evaluation/ban_list/run.py
index 9312988..5577a27 100644
--- a/backend/app/evaluation/ban_list/run.py
+++ b/backend/app/evaluation/ban_list/run.py
@@ -59,9 +59,12 @@ def run_ban_list(text: str) -> tuple[str, int]:
 metrics = compute_binary_metrics(dataset["y_true"], dataset["y_pred"])
 
 if "target_text" in dataset.columns:
-    exact_match = (
-        dataset["redacted_text"].astype(str) == dataset["target_text"].astype(str)
-    ).mean()
+    if dataset.empty:
+        exact_match = 0.0
+    else:
+        exact_match = (
+            dataset["redacted_text"].astype(str) == dataset["target_text"].astype(str)
+        ).mean()
     metrics["exact_match"] = round(float(exact_match), 2)
 
 write_csv(dataset, OUT_DIR / "predictions.csv")