26 changes: 14 additions & 12 deletions backend/README.md
@@ -103,37 +103,39 @@ Important: each `run.py` expects a specific filename, so dataset files must be n
- `app/evaluation/lexical_slur/run.py` expects `lexical_slur_testing_dataset.csv`
- `app/evaluation/pii/run.py` expects `pii_detection_testing_dataset.csv`
- `app/evaluation/gender_assumption_bias/run.py` expects `gender_bias_assumption_dataset.csv`
- `app/evaluation/ban_list/run.py` expects `ban_list_testing_dataset.csv`

Once these files are in place with the exact names above, run the evaluation scripts.

Unit tests for lexical slur match, ban list, and gender assumption bias validators have limited value because their logic is deterministic. However, curated datasets exist for lexical slur match and gender assumption bias to benchmark accuracy and latency. The lexical slur dataset will also be used in future toxicity detection workflows.
Unit tests for lexical slur match, ban list, and gender assumption bias validators have limited value because their logic is deterministic. Curated datasets are used to benchmark accuracy and latency for lexical slur, gender assumption bias, and ban list. The lexical slur dataset will also be used in future toxicity detection workflows.

Each validator produces:
- `predictions.csv` – row-level outputs for debugging and analysis
- `metrics.json` – aggregated accuracy and performance metrics (latency and peak memory)

Standardized output structure:
```text
app/evaluation/outputs/
  lexical_slur/
    predictions.csv
    metrics.json
  gender_assumption_bias/
    predictions.csv
    metrics.json
  pii_remover/
    predictions.csv
    metrics.json
app/evaluation/outputs/<validator-name>
  predictions.csv
  metrics.json
```

- To run all evaluation scripts together, use:
```bash
bash scripts/run_all_evaluations.sh
BAN_LIST_WORDS="word1,word2" bash scripts/run_all_evaluations.sh
```
or
```bash
bash scripts/run_all_evaluations.sh BAN_LIST_WORDS="word1,word2"
```

`BAN_LIST_WORDS` is required for the `ban_list` evaluator and should be a comma-separated list.
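As a quick sanity check on the format, the value is split on commas with whitespace stripped and empty entries dropped; a minimal sketch of that parsing (the sample words are placeholders):

```python
# Mirrors the list comprehension in app/evaluation/ban_list/run.py:
# split on commas, trim whitespace, and drop empty entries.
raw = " word1 , word2 ,"
banned = [word.strip() for word in raw.split(",") if word.strip()]
print(banned)  # ['word1', 'word2']
```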

This script runs the evaluators in sequence:
- `app/evaluation/lexical_slur/run.py`
- `app/evaluation/pii/run.py`
- `app/evaluation/gender_assumption_bias/run.py`
- `app/evaluation/ban_list/run.py`

To run a single evaluator, invoke its offline evaluation script directly: `python <validator's eval script path>`

82 changes: 82 additions & 0 deletions backend/app/evaluation/ban_list/run.py
@@ -0,0 +1,82 @@
import os
from pathlib import Path

import pandas as pd
from guardrails.hub import BanList
from guardrails.validators import FailResult

from app.evaluation.common.helper import (
    build_evaluation_report,
    Profiler,
    compute_binary_metrics,
    write_csv,
    write_json,
)

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "ban_list"
DATASET_PATH = BASE_DIR / "datasets" / "ban_list_testing_dataset.csv"

# Provide comma-separated words via env var BAN_LIST_WORDS, e.g.:
# BAN_LIST_WORDS="badword,slur,profanity"
BAN_LIST_WORDS_RAW = os.getenv("BAN_LIST_WORDS")
if not BAN_LIST_WORDS_RAW:
    raise ValueError(
        "BAN_LIST_WORDS must be set for ban_list evaluation (comma-separated)."
    )

BANNED_WORDS = [word.strip() for word in BAN_LIST_WORDS_RAW.split(",") if word.strip()]

⚠️ Potential issue | 🟡 Minor

Guard against an empty BANNED_WORDS list after filtering.

`if not BAN_LIST_WORDS_RAW` only rejects an unset/empty variable. A value like `BAN_LIST_WORDS=","` passes that guard but produces `BANNED_WORDS = []`, silently initializing `BanList` with an empty banned-words list and rendering the entire evaluation meaningless.

🛡️ Proposed fix

```diff
 BANNED_WORDS = [word.strip() for word in BAN_LIST_WORDS_RAW.split(",") if word.strip()]
+if not BANNED_WORDS:
+    raise ValueError(
+        "BAN_LIST_WORDS contained no valid words after stripping whitespace and commas."
+    )
```
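The edge case the guard targets is easy to reproduce in isolation; a minimal sketch:

```python
# A lone comma is a truthy string, so the existing env-var check passes,
# yet filtering leaves an empty banned-words list.
raw = ","
assert raw  # non-empty string: an `if not raw` guard would not fire
banned = [word.strip() for word in raw.split(",") if word.strip()]
print(banned)  # []
```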

dataset = pd.read_csv(DATASET_PATH)

validator = BanList(
    banned_words=BANNED_WORDS,
)


def run_ban_list(text: str) -> tuple[str, int]:
    result = validator.validate(text, metadata=None)
    if isinstance(result, FailResult):
        return (result.fix_value or text), 1
    return text, 0


with Profiler() as p:
    results = (
        dataset["source_text"].astype(str).apply(lambda x: p.record(run_ban_list, x))
    )

dataset["redacted_text"] = results.apply(lambda x: x[0])
dataset["y_pred"] = results.apply(lambda x: x[1])

if "label" in dataset.columns:
    dataset["y_true"] = dataset["label"].astype(int)
else:
    dataset["y_true"] = (
        dataset["source_text"].astype(str) != dataset["target_text"].astype(str)
    ).astype(int)

metrics = compute_binary_metrics(dataset["y_true"], dataset["y_pred"])

if "target_text" in dataset.columns:
    if dataset.empty:
        exact_match = 0.0
    else:
        exact_match = (
            dataset["redacted_text"].astype(str) == dataset["target_text"].astype(str)
        ).mean()
    metrics["exact_match"] = round(float(exact_match), 2)

write_csv(dataset, OUT_DIR / "predictions.csv")

write_json(
    build_evaluation_report(
        guardrail="ban_list",
        num_samples=len(dataset),
        profiler=p,
        banned_words=BANNED_WORDS,
        dataset=str(DATASET_PATH.name),
        metrics=metrics,
    ),
    OUT_DIR / "metrics.json",
)
36 changes: 36 additions & 0 deletions backend/app/evaluation/common/helper.py
@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Any
import json
import pandas as pd
import time
@@ -16,6 +17,41 @@ def write_json(obj: dict, path: Path):
        json.dump(obj, f, indent=2)


def summarize_latency(latencies: list[float]) -> dict[str, float]:
    if not latencies:
        return {"mean": 0.0, "p95": 0.0, "max": 0.0}

    sorted_latencies = sorted(latencies)
    p95_idx = min(len(sorted_latencies) - 1, int(len(sorted_latencies) * 0.95))

    return {
        "mean": round(sum(latencies) / len(latencies), 2),
        "p95": round(sorted_latencies[p95_idx], 2),
        "max": round(max(latencies), 2),
    }


def build_performance_payload(profiler: "Profiler") -> dict[str, Any]:
    return {
        "latency_ms": summarize_latency(profiler.latencies),
        "memory_mb": round(profiler.peak_memory_mb, 2),
    }


def build_evaluation_report(
    guardrail: str,
    num_samples: int,
    profiler: "Profiler",
    **extra_fields: Any,
) -> dict[str, Any]:
    return {
        "guardrail": guardrail,
        "num_samples": num_samples,
        **extra_fields,
        "performance": build_performance_payload(profiler),
    }
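As an illustration of how these helpers compose, a self-contained sketch with a stand-in profiler (the real `Profiler` lives elsewhere in this module; the sample latencies are invented):

```python
from typing import Any


# Stand-in for the real Profiler: only the attributes the helpers read.
class FakeProfiler:
    latencies = [10.0, 12.0, 14.0, 16.0, 100.0]  # per-sample latencies in ms
    peak_memory_mb = 48.256


def summarize_latency(latencies: list[float]) -> dict[str, float]:
    if not latencies:
        return {"mean": 0.0, "p95": 0.0, "max": 0.0}
    sorted_latencies = sorted(latencies)
    p95_idx = min(len(sorted_latencies) - 1, int(len(sorted_latencies) * 0.95))
    return {
        "mean": round(sum(latencies) / len(latencies), 2),
        "p95": round(sorted_latencies[p95_idx], 2),
        "max": round(max(latencies), 2),
    }


def build_performance_payload(profiler) -> dict[str, Any]:
    return {
        "latency_ms": summarize_latency(profiler.latencies),
        "memory_mb": round(profiler.peak_memory_mb, 2),
    }


def build_evaluation_report(guardrail, num_samples, profiler, **extra_fields):
    # Extra keyword fields (metrics, banned_words, ...) land between the
    # identifying fields and the shared performance payload.
    return {
        "guardrail": guardrail,
        "num_samples": num_samples,
        **extra_fields,
        "performance": build_performance_payload(profiler),
    }


report = build_evaluation_report(
    "lexical_slur", 5, FakeProfiler(), metrics={"accuracy": 1.0}
)
print(report["performance"]["latency_ms"])  # {'mean': 30.4, 'p95': 100.0, 'max': 100.0}
```

This is the payoff of the refactor: each `run.py` passes only its validator-specific fields, and the latency/memory summary stays identical across evaluators.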


def compute_binary_metrics(y_true, y_pred):
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred, strict=True))
    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred, strict=True))
20 changes: 7 additions & 13 deletions backend/app/evaluation/gender_assumption_bias/run.py
@@ -4,6 +4,7 @@

from app.core.validators.gender_assumption_bias import GenderAssumptionBias
from app.evaluation.common.helper import (
    build_evaluation_report,
    compute_binary_metrics,
    Profiler,
    write_csv,
@@ -52,18 +53,11 @@
)

write_json(
    {
        "guardrail": "gender_assumption_bias",
        "num_samples": len(df) * 2,  # because evaluating both sides
        "metrics": metrics,
        "performance": {
            "latency_ms": {
                "mean": round(sum(p.latencies) / len(p.latencies), 2),
                "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2),
                "max": round(max(p.latencies), 2),
            },
            "memory_mb": round(p.peak_memory_mb, 2),
        },
    },
    build_evaluation_report(
        guardrail="gender_assumption_bias",
        num_samples=len(df) * 2,
        profiler=p,
        metrics=metrics,
    ),
    OUT_DIR / "metrics.json",
)
22 changes: 8 additions & 14 deletions backend/app/evaluation/lexical_slur/run.py
@@ -4,8 +4,9 @@

from app.core.validators.lexical_slur import LexicalSlur
from app.evaluation.common.helper import (
    compute_binary_metrics,
    build_evaluation_report,
    Profiler,
    compute_binary_metrics,
    write_csv,
    write_json,
)
@@ -33,18 +34,11 @@
write_csv(df.drop(columns=["result"]), OUT_DIR / "predictions.csv")

write_json(
    {
        "guardrail": "lexical_slur",
        "num_samples": len(df),
        "metrics": metrics,
        "performance": {
            "latency_ms": {
                "mean": round(sum(p.latencies) / len(p.latencies), 2),
                "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2),
                "max": round(max(p.latencies), 2),
            },
            "memory_mb": round(p.peak_memory_mb, 2),
        },
    },
    build_evaluation_report(
        guardrail="lexical_slur",
        num_samples=len(df),
        profiler=p,
        metrics=metrics,
    ),
    OUT_DIR / "metrics.json",
)
26 changes: 12 additions & 14 deletions backend/app/evaluation/pii/run.py
@@ -3,8 +3,13 @@
from guardrails.validators import FailResult

from app.core.validators.pii_remover import PIIRemover
from app.evaluation.common.helper import (
    Profiler,
    build_evaluation_report,
    write_csv,
    write_json,
)
from app.evaluation.pii.entity_metrics import compute_entity_metrics
from app.evaluation.common.helper import Profiler, write_csv, write_json

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "pii_remover"
@@ -35,18 +40,11 @@ def run_pii(text: str) -> str:
write_csv(df, OUT_DIR / "predictions.csv")

write_json(
    {
        "guardrail": "pii_remover",
        "num_samples": len(df),
        "entity_metrics": entity_report,
        "performance": {
            "latency_ms": {
                "mean": round(sum(p.latencies) / len(p.latencies), 2),
                "p95": round(sorted(p.latencies)[int(len(p.latencies) * 0.95)], 2),
                "max": round(max(p.latencies), 2),
            },
            "memory_mb": round(p.peak_memory_mb, 2),
        },
    },
    build_evaluation_report(
        guardrail="pii_remover",
        num_samples=len(df),
        profiler=p,
        entity_metrics=entity_report,
    ),
    OUT_DIR / "metrics.json",
)
9 changes: 9 additions & 0 deletions backend/scripts/run_all_evaluations.sh
@@ -5,10 +5,19 @@ set -euo pipefail
BACKEND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
EVAL_DIR="$BACKEND_DIR/app/evaluation"

# Support passing env assignments as args, e.g.:
# scripts/run_all_evaluations.sh BAN_LIST_WORDS="foo,bar"
for arg in "$@"; do
  if [[ "$arg" == *=* ]]; then
    export "$arg"
  fi
done
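The arg-to-env promotion can be sketched standalone (the variable name `DEMO_WORDS` is hypothetical; `[[ ... ]]` requires bash, which the script already assumes via `BASH_SOURCE`):

```shell
#!/usr/bin/env bash
# Promote any NAME=value positional argument to an environment variable;
# non-assignment arguments fall through the loop untouched.
promote_env_args() {
  for arg in "$@"; do
    if [[ "$arg" == *=* ]]; then
      export "$arg"
    fi
  done
}

promote_env_args DEMO_WORDS="word1,word2" run
echo "$DEMO_WORDS"  # word1,word2
```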

RUNNERS=(
"$EVAL_DIR/lexical_slur/run.py"
"$EVAL_DIR/pii/run.py"
"$EVAL_DIR/gender_assumption_bias/run.py"
"$EVAL_DIR/ban_list/run.py"
)

echo "Running validator evaluations..."