23 commits
cddd605
Merge benchmarks into test.yml for parallel CI execution
sbryngelson Feb 9, 2026
5116e50
Remove cancel-on-bench-failure to avoid killing tests on bench flakes
sbryngelson Feb 9, 2026
5b75291
Consolidate Frontier SLURM jobs into multi-node allocations
sbryngelson Feb 9, 2026
8c14b05
Fix cp-into-self error in consolidated Frontier test script
sbryngelson Feb 9, 2026
50e43f5
Fix review issues: rocm-smi gating, subshell env, job ID parsing
sbryngelson Feb 9, 2026
eebd753
Move concurrency from workflow-level to per-job
sbryngelson Feb 9, 2026
8d7f492
Parallelize Frontier builds on login node
sbryngelson Feb 10, 2026
d852af3
Restore workflow-level concurrency for full run cancellation
sbryngelson Feb 10, 2026
1627311
Add build progress heartbeat and collapsible GHA log groups
sbryngelson Feb 10, 2026
d98e894
Fix heartbeat kill guard and hardcoded node count in log message
sbryngelson Feb 10, 2026
b5f6c04
Clean up stale dirs before creating source copies
sbryngelson Feb 10, 2026
6e173e4
Switch Phoenix GPU jobs to H200 nodes for faster scheduling
sbryngelson Feb 10, 2026
553ff35
Harden Frontier CI orchestration scripts
sbryngelson Feb 10, 2026
e5ce527
Limit parallel builds to 2 concurrent on login node
sbryngelson Feb 11, 2026
de1b6d5
Set SETUPTOOLS_SCM_PRETEND_VERSION for hardlink source copies
sbryngelson Feb 11, 2026
1347a54
Merge branch 'master' into readme
sbryngelson Feb 11, 2026
0a5a0f6
Add build heartbeat during throttled build phase
sbryngelson Feb 12, 2026
2575eb7
Merge branch 'master' into readme
sbryngelson Feb 12, 2026
e2cb418
Fix bash segfault in monitor_slurm_job.sh from fractional read timeout
sbryngelson Feb 12, 2026
580d1f2
Use real copies instead of hardlinks and increase parallel builds to 3
sbryngelson Feb 12, 2026
9d509fb
Remove NODE_OPTIONS from CI workflow
sbryngelson Feb 12, 2026
fe79a51
Add workspace pre-clean to avoid stale NFS handles on self-hosted run…
sbryngelson Feb 12, 2026
0006d6f
Isolate build temp dirs to prevent Cray compiler conflicts
sbryngelson Feb 13, 2026
1 change: 0 additions & 1 deletion .github/file-filter.yml
@@ -25,7 +25,6 @@ yml: &yml
   - '.github/workflows/phoenix/**'
   - '.github/workflows/frontier/**'
   - '.github/workflows/frontier_amd/**'
-  - '.github/workflows/bench.yml'
   - '.github/workflows/test.yml'
   - '.github/workflows/formatting.yml'

41 changes: 41 additions & 0 deletions .github/scripts/frontier_bench_config.sh
@@ -0,0 +1,41 @@
#!/bin/bash
# Run a single benchmark on a Frontier compute node (build already done on login node).
# Usage: frontier_bench_config.sh <cluster> <device> <interface>
# Runs inside a SLURM allocation on an ssh'd compute node.

set -e
set -x

cluster=$1; device=$2; interface=$3

flag="f"; [ "$cluster" = "frontier_amd" ] && flag="famd"
mode="g"; [ "$device" = "cpu" ] && mode="c"

. ./mfc.sh load -c "$flag" -m "$mode"

# Benchmark
job_slug="bench-${device}-${interface}"
n_ranks=12
device_opts=""
if [ "$device" = "gpu" ]; then
gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
n_ranks=$(echo "$gpus" | wc -w)
if [ "$n_ranks" -lt 1 ] || [ "$n_ranks" -gt 16 ]; then
echo "ERROR: Unexpected GPU count ($n_ranks). Expected 1-16 for Frontier MI250X."
echo "rocm-smi output:"
rocm-smi --showid
exit 1
fi
echo "Detected $n_ranks GPUs: $gpus"
gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//')
device_opts="--gpu"
[ "$interface" = "acc" ] && device_opts+=" acc"
[ "$interface" = "omp" ] && device_opts+=" mp"
device_opts+=" -g $gpu_ids"
fi

if [ "$device" = "gpu" ]; then
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c "$cluster" $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c "$cluster" $device_opts -n $n_ranks
fi
35 changes: 35 additions & 0 deletions .github/scripts/frontier_bench_post.sh
@@ -0,0 +1,35 @@
#!/bin/bash
# Post-process all Frontier benchmark results after the SLURM job completes.
# Runs bench_diff for each config, comparing master vs PR YAML outputs.

set -euo pipefail

# Benchmark configs: cluster device interface flag
bench_configs=(
    "frontier:gpu:acc:f"
    "frontier:gpu:omp:f"
    "frontier_amd:gpu:omp:famd"
)

for cfg in "${bench_configs[@]}"; do
    IFS=':' read -r cluster device interface flag <<< "$cfg"
    pr_yaml="pr-${cluster}-${device}-${interface}/bench-${device}-${interface}.yaml"
    master_yaml="master-${cluster}-${device}-${interface}/bench-${device}-${interface}.yaml"

    echo "=========================================="
    echo "bench_diff: $cluster $device $interface"
    echo " PR: $pr_yaml"
    echo " Master: $master_yaml"
    echo "=========================================="

    if [ ! -f "$pr_yaml" ]; then
        echo "ERROR: PR YAML not found: $pr_yaml"
        exit 1
    fi
    if [ ! -f "$master_yaml" ]; then
        echo "ERROR: Master YAML not found: $master_yaml"
        exit 1
    fi

    (cd pr && . ./mfc.sh load -c "$flag" -m g && ./mfc.sh bench_diff "../$master_yaml" "../$pr_yaml")
done
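To make the loop concrete, here is how a single iteration expands for the first entry, `frontier:gpu:acc:f` (paths come directly from the templates above; shown for illustration only):

    cluster=frontier; device=gpu; interface=acc; flag=f
    pr_yaml="pr-frontier-gpu-acc/bench-gpu-acc.yaml"
    master_yaml="master-frontier-gpu-acc/bench-gpu-acc.yaml"
    (cd pr && . ./mfc.sh load -c f -m g && ./mfc.sh bench_diff "../$master_yaml" "../$pr_yaml")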
41 changes: 41 additions & 0 deletions .github/scripts/frontier_test_config.sh
@@ -0,0 +1,41 @@
#!/bin/bash
# Run a single test on a Frontier compute node (build already done on login node).
# Usage: frontier_test_config.sh <cluster> <device> <interface>
# Runs inside a SLURM allocation on an ssh'd compute node.

set -e
set -x

cluster=$1; device=$2; interface=$3

flag="f"; [ "$cluster" = "frontier_amd" ] && flag="famd"
mode="g"; [ "$device" = "cpu" ] && mode="c"

. ./mfc.sh load -c "$flag" -m "$mode"

# Device options
device_opts=""
if [ "$device" = "gpu" ]; then
device_opts="--gpu"
[ "$interface" = "acc" ] && device_opts+=" acc"
[ "$interface" = "omp" ] && device_opts+=" mp"
fi

rdma=""
[ "$cluster" = "frontier" ] && [ "$device" = "gpu" ] && rdma="--rdma-mpi"

# Test
if [ "$device" = "gpu" ]; then
gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
ngpus=$(echo "$gpus" | wc -w)
if [ "$ngpus" -lt 1 ] || [ "$ngpus" -gt 16 ]; then
echo "ERROR: Unexpected GPU count ($ngpus). Expected 1-16 for Frontier MI250X."
echo "rocm-smi output:"
rocm-smi --showid
exit 1
fi
echo "Detected $ngpus GPUs: $gpus"
./mfc.sh test -v -a $rdma --max-attempts 3 -j $ngpus $device_opts -- -c "$cluster"
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c "$cluster"
fi
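For reference, the command the GPU branch resolves to for `frontier gpu acc` on a node where rocm-smi reports 8 GPU ids (the count is detected at runtime; 8 is assumed here purely for illustration):

    ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j 8 --gpu acc -- -c frontier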
4 changes: 2 additions & 2 deletions .github/scripts/monitor_slurm_job.sh
@@ -64,7 +64,7 @@ while true; do
     # Try to read from tail output (non-blocking via timeout)
     # Read multiple lines if available to avoid falling behind
     lines_read=0
-    while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
+    while IFS= read -r -t 1 line <&3 2>/dev/null; do
         echo "$line"
         lines_read=$((lines_read + 1))
         last_heartbeat=$(date +%s)
@@ -115,7 +115,7 @@ done
 # Drain any remaining output from tail after job completes
 echo "Draining remaining output..."
 drain_count=0
-while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
+while IFS= read -r -t 1 line <&3 2>/dev/null; do
     echo "$line"
     drain_count=$((drain_count + 1))
     # Safety limit to avoid infinite loop
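For readers outside the full script, a minimal sketch of the pattern these hunks adjust, assuming (as the hunks themselves suggest) that fd 3 is attached to a `tail -f` of the SLURM output file; variable names are illustrative. The switch from fractional to integer `-t` values reflects the commit above, which attributes a bash segfault to the fractional read timeout:

    # Hypothetical stand-alone version of the monitoring loop:
    exec 3< <(tail -n +1 -f "$slurm_out")          # follow job output on fd 3
    while true; do
        # Integer timeout only; fractional -t triggered the segfault fixed above.
        while IFS= read -r -t 1 line <&3 2>/dev/null; do
            echo "$line"
        done
        [ -n "$(squeue -j "$job_id" -h 2>/dev/null)" ] || break   # job left the queue
        sleep 5
    done
    exec 3<&-                                       # close the tail descriptor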