diff --git a/pkg/collector/gpu/hardware.go b/pkg/collector/gpu/hardware.go
new file mode 100644
index 000000000..18988e020
--- /dev/null
+++ b/pkg/collector/gpu/hardware.go
@@ -0,0 +1,43 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gpu
+
+import "context"
+
+// HardwareDetector abstracts GPU hardware detection so that callers can be
+// tested with a substitute. Implementations enumerate PCI devices and kernel
+// module state without requiring GPU drivers to be installed.
+type HardwareDetector interface {
+	// Detect discovers GPU hardware and driver module state, returning a
+	// *HardwareInfo describing what was found or an error if detection could not
+	// be performed (e.g., sysfs not available). ctx presumably bounds the probe — TODO confirm.
+	Detect(ctx context.Context) (*HardwareInfo, error)
+}
+
+// HardwareInfo is the result of HardwareDetector.Detect: GPU hardware state detected without drivers.
+type HardwareInfo struct {
+	// GPUPresent is true if at least one NVIDIA GPU was found via PCI enumeration.
+	GPUPresent bool // NOTE(review): presumably equals GPUCount > 0; not enforced by this type — confirm
+
+	// GPUCount is the number of NVIDIA GPUs detected via PCI enumeration.
+	GPUCount int
+
+	// DriverLoaded is true if the nvidia kernel module is currently loaded.
+	DriverLoaded bool
+
+	// DetectionSource identifies which detection method produced this result
+	// (e.g., "nfd", "sysfs"). It is a plain string; this file declares no constants for it.
+	DetectionSource string
+}
diff --git a/pkg/collector/gpu/hardware_test.go b/pkg/collector/gpu/hardware_test.go
new file mode 100644
index 000000000..94dcec7de
--- /dev/null
+++ b/pkg/collector/gpu/hardware_test.go
@@ -0,0 +1,108 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gpu
+
+import (
+	"context"
+	"testing"
+
+	"github.com/NVIDIA/aicr/pkg/errors"
+)
+
+// mockHardwareDetector is a test double for the HardwareDetector interface.
+type mockHardwareDetector struct {
+	info *HardwareInfo // canned result handed back by Detect
+	err  error         // canned error handed back by Detect
+}
+
+func (m *mockHardwareDetector) Detect(_ context.Context) (*HardwareInfo, error) {
+	return m.info, m.err // the context is ignored; nothing is actually probed
+}
+
+// TestHardwareDetectorInterface runs table cases through the HardwareDetector interface using mock detectors.
+func TestHardwareDetectorInterface(t *testing.T) {
+	tests := []struct {
+		name        string
+		detector    HardwareDetector
+		wantPresent bool
+		wantCount   int
+		wantDriver  bool
+		wantDetSrc  string
+		wantErr     bool
+	}{
+		{
+			name: "GPU present with driver",
+			detector: &mockHardwareDetector{
+				info: &HardwareInfo{
+					GPUPresent:      true,
+					GPUCount:        2,
+					DriverLoaded:    true,
+					DetectionSource: "nfd",
+				},
+			},
+			wantPresent: true,
+			wantCount:   2,
+			wantDriver:  true,
+			wantDetSrc:  "nfd",
+		},
+		{
+			name: "no GPU hardware",
+			detector: &mockHardwareDetector{
+				info: &HardwareInfo{
+					GPUPresent:      false,
+					GPUCount:        0,
+					DriverLoaded:    false,
+					DetectionSource: "nfd",
+				},
+			},
+			wantPresent: false,
+			wantCount:   0,
+			wantDriver:  false,
+			wantDetSrc:  "nfd",
+		},
+		{
+			name: "detection failure",
+			detector: &mockHardwareDetector{
+				info: nil,
+				err:  errors.New(errors.ErrCodeInternal, "sysfs not available"),
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			info, err := tt.detector.Detect(context.Background())
+			if (err != nil) != tt.wantErr || (err == nil && info == nil) { // also rejects (nil, nil)
+				t.Fatalf("Detect() = (%v, %v), wantErr %v", info, err, tt.wantErr)
+			}
+			if err != nil {
+				return // error expected and received; skip field assertions
+			}
+			if info.GPUPresent != tt.wantPresent {
+				t.Errorf("GPUPresent = %v, want %v", info.GPUPresent, tt.wantPresent)
+			}
+			if info.GPUCount != tt.wantCount {
+				t.Errorf("GPUCount = %v, want %v", info.GPUCount, tt.wantCount)
+			}
+			if info.DriverLoaded != tt.wantDriver {
+				t.Errorf("DriverLoaded = %v, want %v", info.DriverLoaded, tt.wantDriver)
+			}
+			if info.DetectionSource != tt.wantDetSrc {
+				t.Errorf("DetectionSource = %q, want %q", info.DetectionSource, tt.wantDetSrc)
+			}
+		})
+	}
+}
diff --git a/pkg/defaults/timeouts.go b/pkg/defaults/timeouts.go
index
3ed90afdb..940e64abc 100644 --- a/pkg/defaults/timeouts.go +++ b/pkg/defaults/timeouts.go @@ -25,6 +25,11 @@ const ( // CollectorK8sTimeout is the timeout for Kubernetes API calls in collectors. // Covers 6 sequential sub-collectors (server, image, policy, node, helm, argocd). CollectorK8sTimeout = 60 * time.Second + + // NFDDetectionTimeout is the timeout for NFD-based hardware detection. + // PCI enumeration and kernel module listing are fast local operations + // reading from sysfs/procfs, so a short timeout is sufficient. + NFDDetectionTimeout = 5 * time.Second ) // Node topology collector constants. diff --git a/pkg/defaults/timeouts_test.go b/pkg/defaults/timeouts_test.go index cbfe59d82..367207c29 100644 --- a/pkg/defaults/timeouts_test.go +++ b/pkg/defaults/timeouts_test.go @@ -29,6 +29,7 @@ func TestTimeoutConstants(t *testing.T) { // Collector timeouts {"CollectorTimeout", CollectorTimeout, 5 * time.Second, 30 * time.Second}, {"CollectorK8sTimeout", CollectorK8sTimeout, 30 * time.Second, 120 * time.Second}, + {"NFDDetectionTimeout", NFDDetectionTimeout, 1 * time.Second, 15 * time.Second}, {"CollectorTopologyTimeout", CollectorTopologyTimeout, 60 * time.Second, 180 * time.Second}, // Handler timeouts diff --git a/pkg/measurement/types.go b/pkg/measurement/types.go index 45b78d34a..2e1a705a6 100644 --- a/pkg/measurement/types.go +++ b/pkg/measurement/types.go @@ -31,6 +31,11 @@ const ( KeyGPUDriver = "driver" KeyGPUModel = "model" KeyGPUCount = "gpu-count" + + // GPU hardware detection keys (NFD-based, no driver required) + KeyGPUPresent = "gpu-present" + KeyGPUDriverLoaded = "driver-loaded" + KeyGPUDetectionSource = "detection-source" ) // Internal measurement keys used only within this package.