Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 19 additions & 42 deletions pkg/dcgm/diag.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,59 +73,36 @@ func diagResultString(r int) string {
return ""
}

// swTestName maps a DCGM software-test identifier (one of the DCGM_SWTEST_*
// constants) to a short human-readable description of what that test checks.
// An identifier that matches none of the listed constants yields the empty
// string.
func swTestName(t int) string {
	// Stays empty unless one of the cases below matches.
	var desc string

	switch t {
	case C.DCGM_SWTEST_DENYLIST:
		desc = "presence of drivers on the denylist (e.g. nouveau)"
	case C.DCGM_SWTEST_NVML_LIBRARY:
		desc = "presence (and version) of NVML lib"
	case C.DCGM_SWTEST_CUDA_MAIN_LIBRARY:
		desc = "presence (and version) of CUDA lib"
	case C.DCGM_SWTEST_CUDA_RUNTIME_LIBRARY:
		desc = "presence (and version) of CUDA RT lib"
	case C.DCGM_SWTEST_PERMISSIONS:
		desc = "character device permissions"
	case C.DCGM_SWTEST_PERSISTENCE_MODE:
		desc = "persistence mode enabled"
	case C.DCGM_SWTEST_ENVIRONMENT:
		desc = "CUDA environment vars that may slow tests"
	case C.DCGM_SWTEST_PAGE_RETIREMENT:
		desc = "pending frame buffer page retirement"
	case C.DCGM_SWTEST_GRAPHICS_PROCESSES:
		desc = "graphics processes running"
	case C.DCGM_SWTEST_INFOROM:
		desc = "inforom corruption"
	}

	return desc
}

// gpuTestName returns the category name for a diagnostic test based on its test ID.
// This function handles all diagnostic test types including GPU tests and software tests.
// Software tests (DCGM_SWTEST_*) all report under DCGM_SOFTWARE_INDEX and return "software".
// Detailed test information is provided in TestOutput, not in the TestName.
//
// Every category name is returned in lowercase. A test ID that matches none of
// the DCGM_*_INDEX constants listed below yields the empty string.
func gpuTestName(t int) string {
	// Exactly one return per case: a second return after the first would be
	// unreachable and would silently leave the first spelling in effect.
	switch t {
	case C.DCGM_MEMORY_INDEX:
		return "memory"
	case C.DCGM_DIAGNOSTIC_INDEX:
		return "diagnostic"
	case C.DCGM_PCI_INDEX:
		return "pcie"
	case C.DCGM_SM_STRESS_INDEX:
		return "sm stress"
	case C.DCGM_TARGETED_STRESS_INDEX:
		return "targeted stress"
	case C.DCGM_TARGETED_POWER_INDEX:
		return "targeted power"
	case C.DCGM_MEMORY_BANDWIDTH_INDEX:
		return "memory bandwidth"
	case C.DCGM_MEMTEST_INDEX:
		return "memtest"
	case C.DCGM_PULSE_TEST_INDEX:
		return "pulse"
	case C.DCGM_EUD_TEST_INDEX:
		return "eud"
	case C.DCGM_SOFTWARE_INDEX:
		return "software"
	case C.DCGM_CONTEXT_CREATE_INDEX:
		return "context create"
	}
	return ""
}
Expand Down Expand Up @@ -189,7 +166,7 @@ func newDiagResult(resultIndex uint, response C.dcgmDiagResponse_v12) DiagResult

msg, code := getErrorMsg(entityId, testId, response)
info := getInfoMsg(entityId, testId, response)
testName := getTestName(resultIndex, response)
testName := gpuTestName(int(testId))
serial := getSerial(resultIndex, response)

return DiagResult{
Expand Down Expand Up @@ -227,9 +204,9 @@ func diagLevel(diagType DiagType) C.dcgmDiagnosticLevel_t {
// - error if the diagnostics failed to run
func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error) {
var diagResults C.dcgmDiagResponse_v12
diagResults.version = makeVersion12(unsafe.Sizeof(diagResults))
diagResults.version = C.dcgmDiagResponse_version12

result := C.dcgmRunDiagnostic(handle.handle, groupID.handle, diagLevel(diagType), (*C.dcgmDiagResponse_v12)(unsafe.Pointer(&diagResults)))
result := C.dcgmRunDiagnostic(handle.handle, groupID.handle, diagLevel(diagType), &diagResults)
if err := errorString(result); err != nil {
return DiagResults{}, &Error{msg: C.GoString(C.errorString(result)), Code: result}
}
Expand Down
78 changes: 78 additions & 0 deletions tests/diag_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package tests

import (
"strings"
"testing"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/stretchr/testify/assert"
)

// TestDiagnostics demonstrates running DCGM diagnostics
Expand Down Expand Up @@ -99,3 +101,79 @@ func TestDiagnosticsLong(t *testing.T) {
t.Logf(" %s: %s", test.TestName, test.Status)
}
}

// TestDiagTestNameFormat initializes DCGM in embedded mode, calls RunDiag with
// DiagQuick on the all-GPUs group, and checks every entry of the Software
// results: its TestName must be one of the known lowercase category names and
// must not contain fragments of a detailed test description (issue #97).
func TestDiagTestNameFormat(t *testing.T) {
	cleanup, initErr := dcgm.Init(dcgm.Embedded)
	if initErr != nil {
		t.Fatalf("Failed to initialize DCGM: %v", initErr)
	}
	defer cleanup()

	results, runErr := dcgm.RunDiag(dcgm.DiagQuick, dcgm.GroupAllGPUs())
	if runErr != nil {
		t.Fatalf("Failed to run diagnostics: %v", runErr)
	}

	assert.NotEmpty(t, results.Software, "diagnostic results should not be empty")

	// Category names a result is allowed to carry (all lowercase).
	allowedCategories := []string{
		"software", "memory", "pcie", "diagnostic",
		"sm stress", "targeted stress", "targeted power", "memory bandwidth",
		"memtest", "pulse", "eud", "context create",
	}

	// Fragments of detailed descriptions that must never leak into TestName.
	forbiddenFragments := []string{
		"presence of drivers on the denylist",
		"(e.g. nouveau)",
		"Allocated",
		"bytes",
		"presence (and version)",
	}

	for idx, res := range results.Software {
		name := res.TestName
		t.Logf("Result %d: TestName=%q, Status=%s", idx, name, res.Status)

		// The name has to be one of the allowed categories.
		assert.Contains(t, allowedCategories, name,
			"TestName should be a category name like 'software', 'memory', 'pcie', got: %q", name)

		// None of the description fragments may appear inside the name.
		for _, fragment := range forbiddenFragments {
			assert.NotContains(t, name, fragment,
				"TestName should not contain detailed descriptions, got: %q", name)
		}

		// Lowercasing the name must leave it unchanged.
		assert.Equal(t, strings.ToLower(name), name,
			"TestName should be lowercase, got: %q", name)
	}
}