3 changes: 3 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -43,6 +43,9 @@ Collecting full debug bundle (optional):
curl -o must-gather.sh -L https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh
chmod +x must-gather.sh
./must-gather.sh

# For extended diagnostics (includes system/PCI info):
ENABLE_EXTENDED_DIAGNOSTICS=true ./must-gather.sh
```
**NOTE**: please refer to the [must-gather](https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh) script for details on the debug data collected.

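The updated must-gather.sh (below) also reads `DEBUG_CONTAINER_IMAGE` and `DEBUG_TIMEOUT_SECONDS` from the environment. A rough invocation for clusters that mirror the debug image to a private registry might look like this; the registry host is purely illustrative:

```
ENABLE_EXTENDED_DIAGNOSTICS=true \
DEBUG_CONTAINER_IMAGE=registry.example.com/nvidia/gpu-operator-debug:latest \
DEBUG_TIMEOUT_SECONDS=120 \
./must-gather.sh
```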
12 changes: 12 additions & 0 deletions Makefile
@@ -295,6 +295,18 @@ build-image:
# This includes https://github.com/openshift-psap/ci-artifacts
docker-image: OUT_IMAGE ?= $(IMAGE_NAME):$(IMAGE_TAG)

##### Debug Container #####
DEBUG_CONTAINER_IMAGE ?= ghcr.io/nvidia/gpu-operator-debug
DEBUG_CONTAINER_TAG ?= latest

.PHONY: build-debug-container push-debug-container

build-debug-container:
$(DOCKER) build -t $(DEBUG_CONTAINER_IMAGE):$(DEBUG_CONTAINER_TAG) hack/debug-container/

push-debug-container: build-debug-container
$(DOCKER) push $(DEBUG_CONTAINER_IMAGE):$(DEBUG_CONTAINER_TAG)

install-tools:
@echo Installing tools from tools.go
export GOBIN=$(PROJECT_DIR)/bin && \
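A quick way to exercise the new targets against a different registry is to override the `?=` defaults on the command line; the image name and tag below are illustrative. `push-debug-container` builds the image first via its prerequisite:

```
make push-debug-container \
    DEBUG_CONTAINER_IMAGE=registry.example.com/nvidia/gpu-operator-debug \
    DEBUG_CONTAINER_TAG=v0.1.0
```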
9 changes: 9 additions & 0 deletions hack/debug-container/Dockerfile
@@ -0,0 +1,9 @@
FROM ubuntu:22.04

RUN apt-get update && apt-get install -y --no-install-recommends \
dmidecode \
pciutils \
&& rm -rf /var/lib/apt/lists/*

ENTRYPOINT ["/bin/sh"]

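Since the entrypoint is `/bin/sh`, a local smoke test of the installed tools could look like the following, assuming the image has been built or pulled locally; this only checks that the binaries are present:

```
docker run --rm ghcr.io/nvidia/gpu-operator-debug:latest \
    -c "dmidecode --version && lspci --version"
```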
110 changes: 106 additions & 4 deletions hack/must-gather.sh
@@ -3,6 +3,57 @@
set -o nounset
set -x

# Set ENABLE_EXTENDED_DIAGNOSTICS=true to use a debug container for complete nvidia-bug-report collection
ENABLE_EXTENDED_DIAGNOSTICS=${ENABLE_EXTENDED_DIAGNOSTICS:-false}
DEBUG_CONTAINER_IMAGE=${DEBUG_CONTAINER_IMAGE:-ghcr.io/nvidia/gpu-operator-debug:latest}
DEBUG_TIMEOUT_SECONDS=${DEBUG_TIMEOUT_SECONDS:-60}

# Noise patterns from kubectl debug output that should be filtered
KUBECTL_NOISE_PATTERN="^Targeting\|^Defaulting\|^Unable\|^warning:\|^All commands\|^If you don"

# Filter out kubectl informational messages from output
filter_kubectl_noise() {
grep -v "${KUBECTL_NOISE_PATTERN}" || true
}

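For context, the pattern above drops the status lines kubectl debug prints before the target command's output. A minimal sanity check of the filter, reusing the same pattern with an illustrative sample message, would be:

```
printf 'Defaulting debug container name to debugger-x7k2p.\nHandle 0x0001, DMI type 1\n' \
    | grep -v "^Targeting\|^Defaulting\|^Unable\|^warning:\|^All commands\|^If you don"
# Only the second line (actual diagnostic output) survives the filter.
```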
# Append a section header to the bug report
append_section_header() {
local file="$1"
local title="$2"

{
echo ""
echo "____________________________________________"
echo ""
echo "${title}"
echo ""
} >> "${file}"
}

# Collect diagnostic output using debug container and append to bug report
# Args: $1=pod_name, $2=node_name, $3=command, $4=command_args, $5=output_file
collect_debug_diagnostic() {
local pod_name="$1"
local node_name="$2"
local cmd="$3"
local cmd_args="$4"
local output_file="$5"

append_section_header "${output_file}" "${cmd} ${cmd_args} output (via must-gather extended diagnostics)"

# Use -i to attach stdin (required to capture output)
if ! timeout "${DEBUG_TIMEOUT_SECONDS}" $K debug -n "${OPERATOR_NAMESPACE}" "${pod_name}" \
--image="${DEBUG_CONTAINER_IMAGE}" \
--target=nvidia-driver-ctr \
--profile=sysadmin \
-i \
-- ${cmd} ${cmd_args} 2>/dev/null | filter_kubectl_noise >> "${output_file}"; then
echo "Warning: Failed to collect ${cmd} from ${node_name} (timed out or failed)" >&2
echo "(collection failed or timed out after ${DEBUG_TIMEOUT_SECONDS}s)" >> "${output_file}"
fi
}

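For reference, a rough manual equivalent of what collect_debug_diagnostic() runs per node is shown below; the namespace and pod name are placeholders, and the flags mirror the function above:

```
kubectl debug -n gpu-operator nvidia-driver-daemonset-abc12 \
    --image=ghcr.io/nvidia/gpu-operator-debug:latest \
    --target=nvidia-driver-ctr \
    --profile=sysadmin \
    -i -- lspci -vvv
```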

K=kubectl
if ! $K version > /dev/null; then
K=oc
@@ -262,18 +313,69 @@ echo "# nvidia-bug-report.sh"
echo "#"
echo ""

if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then
echo "==============================================================================="
echo "WARNING: Extended diagnostics enabled."
echo ""
echo "This will pull and run an external debug container (${DEBUG_CONTAINER_IMAGE})"
echo "with privileged access to collect system information (dmidecode, lspci)."
echo ""
echo "By enabling this option, you acknowledge:"
echo " - An external container image will be pulled and executed in your cluster"
echo " - The debug container requires privileged access (sysadmin profile)"
echo " - System hardware information will be collected and included in the bug report"
echo ""
echo "To disable, unset ENABLE_EXTENDED_DIAGNOSTICS or set it to false."
echo "==============================================================================="
echo ""
fi

for pod in $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-driver-daemonset -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}");
do
pod_nodename=$($K get "${pod}" -ojsonpath={.spec.nodeName} -n "${OPERATOR_NAMESPACE}")
pod_name=$(basename "${pod}")
echo "Saving nvidia-bug-report from ${pod_nodename} ..."

# Collect standard nvidia-bug-report from driver container
if ! $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2; then
echo "Failed to collect nvidia-bug-report from ${pod_nodename}"
continue
fi

# Clean up any existing temp file to avoid permission issues
rm -f /tmp/nvidia-bug-report.log.gz

if ! $K cp "${OPERATOR_NAMESPACE}"/"${pod_name}":/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz 2>/dev/null; then
echo "Failed to save nvidia-bug-report from ${pod_nodename}"
continue
fi

mv /tmp/nvidia-bug-report.log.gz "${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log.gz"

if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then
echo "Collecting extended diagnostics (dmidecode/lspci) from ${pod_nodename}..."

bug_report_file="${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log"

# Decompress the bug report to append data
if ! gunzip "${bug_report_file}.gz" 2>&1; then
echo "Warning: Failed to decompress bug report for ${pod_nodename}, skipping extended diagnostics"
continue
fi

append_section_header "${bug_report_file}" "*** EXTENDED DIAGNOSTICS (from debug container) ***"

collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "dmidecode" "" "${bug_report_file}"
collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "lspci" "-vvv" "${bug_report_file}"
Comment on lines +369 to +370
Contributor:
Question -- would it be possible to run nvidia-bug-report.sh itself in the debug container?

Member Author @karthikvetrivel (Dec 15, 2025):
Yeah, I believe so. However, it'd require adding a lot of NVIDIA utilities/libraries to the debug container. I don't think the debug container would be "lightweight" anymore.

Do you prefer running the script from the debug container itself?


# Recompress the bug report
if ! gzip "${bug_report_file}" 2>&1; then
echo "Warning: Failed to recompress bug report for ${pod_nodename}"
fi
else
echo "NOTE: For extended diagnostics (dmidecode/lspci), set ENABLE_EXTENDED_DIAGNOSTICS=true"
fi
done

echo ""
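To review the appended diagnostics afterwards, one possible check against the collected artifact is the following; the node name is a placeholder and `ARTIFACT_DIR` is the output directory used by the script:

```
gunzip -c "${ARTIFACT_DIR}/nvidia-bug-report_<node-name>.log.gz" \
    | sed -n '/EXTENDED DIAGNOSTICS/,$p'
```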