Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,14 @@ jobs:
ghcr.io/${{ github.repository_owner }}/sichek:latest
labels: |
org.opencontainers.image.source=https://github.com/${{ github.repository }}
- name: Build and push Docker image (ubuntu20.04, cuda12.8)
uses: docker/build-push-action@v4
with:
context: ./
file: docker/Dockerfile.cuda128
platforms: linux/amd64
push: true
tags: |
ghcr.io/${{ github.repository_owner }}/sichek:${{ github.ref_name }}-ubuntu2004-cuda12.8
labels: |
org.opencontainers.image.source=https://github.com/${{ github.repository }}
1 change: 1 addition & 0 deletions .goreleaser.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ builds:
- -mod=vendor
ldflags:
- "-s -w"
- "-X github.com/scitix/sichek/cmd/command.Version={{ .Tag }}"
- "-X github.com/scitix/sichek/cmd/command.Major={{ .Major }}"
- "-X github.com/scitix/sichek/cmd/command.Minor={{ .Minor }}"
- "-X github.com/scitix/sichek/cmd/command.Patch={{ .Patch }}"
Expand Down
7 changes: 6 additions & 1 deletion cmd/command/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ import (
)

var (
// Version is the full version string (e.g. v0.7.6.post1), set by ldflags at build time.
// When set, it is used for display instead of Major.Minor.Patch.
Version = ""
Major = ""
Minor = ""
Patch = ""
Expand All @@ -52,7 +55,9 @@ func NewVersionCmd() *cobra.Command {
now := time.Now()
BuildTime = now.Format("2006-01-02T15:04:05")
}
if Major == "" {
if Version != "" {
version = Version
} else if Major == "" {
version = "dev-" + GitCommit
} else {
version = "v" + Major + "." + Minor + "." + Patch
Expand Down
11 changes: 11 additions & 0 deletions components/infiniband/checker/ib_devs.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes

var mismatchPairs []string
infinibandInfo.RLock()
// 1) Spec -> actual: missing or wrong mapping
for expectedMlx5, expectedIb := range c.spec.IBPFDevs {
// skip mezzanine card in check
if strings.Contains(expectedMlx5, "mezz") {
Expand All @@ -80,6 +81,16 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes
mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s -> %s (expected %s)", expectedMlx5, actualIb, expectedIb))
}
}
// 2) Actual -> spec: extra devices not defined in spec (e.g. mlx5_7 in spec but actual shows mlx5_13_6209)
for actualMlx5 := range infinibandInfo.IBPFDevs {
if strings.Contains(actualMlx5, "mezz") {
continue
}
if _, inSpec := c.spec.IBPFDevs[actualMlx5]; !inSpec {
mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s (not in spec)", actualMlx5))
logrus.WithField("component", "infiniband").Debugf("mismatch: actual device %s not defined in spec", actualMlx5)
}
}
infinibandInfo.RUnlock()

if len(mismatchPairs) > 0 {
Expand Down
5 changes: 5 additions & 0 deletions components/nvidia/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,14 @@ type NvidiaConfig struct {
QueryInterval common.Duration `json:"query_interval" yaml:"query_interval"`
CacheSize int64 `json:"cache_size" yaml:"cache_size"`
EnableMetrics bool `json:"enable_metrics" yaml:"enable_metrics"`
EnableXidPoller bool `json:"enable_xid_poller" yaml:"enable_xid_poller"`
IgnoredCheckers []string `json:"ignored_checkers,omitempty" yaml:"ignored_checkers,omitempty"`
}

// IsXidPollerEnabled reports whether the XID event poller feature is
// switched on in this configuration (driven by the `enable_xid_poller`
// config key).
func (c *NvidiaConfig) IsXidPollerEnabled() bool {
	return c.EnableXidPoller
}

func (c *NvidiaUserConfig) GetCheckerSpec() map[string]common.CheckerSpec {
commonCfgMap := make(map[string]common.CheckerSpec)
return commonCfgMap
Expand Down
55 changes: 40 additions & 15 deletions components/nvidia/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ type component struct {
metrics *metrics.NvidiaMetrics

initError error // Track initialization errors with detailed information

// ReNewNvml rate limit: when called from HealthCheck timeout path, skip if last ReNewNvml was < 60s ago
renewBackoffMtx sync.Mutex
lastReNewNvmlTime time.Time
}

var (
Expand Down Expand Up @@ -117,6 +121,17 @@ func NewNvml(ctx context.Context) (nvml.Interface, error) {
// Note: The caller must hold c.healthCheckMtx lock before calling this function
// to ensure thread-safe access to nvmlInst and xidPoller.
func ReNewNvml(c *component) error {
const minReNewNvmlInterval = 60 * time.Second
// When nvmlInst exists (called from HealthCheck timeout path), rate-limit to avoid thrashing with DCGM
if c.nvmlInst != nil {
c.renewBackoffMtx.Lock()
last := c.lastReNewNvmlTime
c.renewBackoffMtx.Unlock()
if !last.IsZero() && time.Since(last) < minReNewNvmlInterval {
logrus.WithField("component", "nvidia").Debugf("ReNewNvml skipped (last run %v ago)", time.Since(last))
return nil
}
}

// Stop the XidEventPoller before shutting down NVML to prevent SIGSEGV
if c.xidPoller != nil {
Expand Down Expand Up @@ -144,31 +159,34 @@ func ReNewNvml(c *component) error {
if c.nvmlInstPtr != nil {
*c.nvmlInstPtr = nvmlInst
}
// Recreate the XidEventPoller with the new NVML instance
// Use RLock to check running status (nested lock: healthCheckMtx -> serviceMtx is safe)
// Recreate the XidEventPoller with the new NVML instance only when enabled
c.serviceMtx.RLock()
isRunning := c.running
c.serviceMtx.RUnlock()
if isRunning {
newXidPoller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel)
var newPoller *XidEventPoller
if isRunning && c.cfg != nil && c.cfg.Nvidia != nil && c.cfg.Nvidia.IsXidPollerEnabled() {
poller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel)
if err != nil {
logrus.WithField("component", "nvidia").Errorf("failed to recreate xid poller after NVML reinit: %v", err)
} else {
c.xidPoller = newXidPoller
// Restart the poller in a goroutine if the component is still running
newPoller = poller
go func() {
defer func() {
if err := recover(); err != nil {
fmt.Printf("[xidPoller] panic err is %s\n", err)
}
}()
err := c.xidPoller.Start()
err := poller.Start()
if err != nil {
logrus.WithField("component", "nvidia").Errorf("start xid poller failed after reinit: %v", err)
}
}()
}
}
c.xidPoller = newPoller
c.renewBackoffMtx.Lock()
c.lastReNewNvmlTime = time.Now()
c.renewBackoffMtx.Unlock()
}
return ret
}
Expand Down Expand Up @@ -264,10 +282,13 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp
return component, nil
}

// Use a timeout for collector init so nvidia-smi (SoftwareInfo.Get) cannot hang forever
collectorCtx, collectorCancel := context.WithTimeout(ctx, consts.CmdTimeout)
defer collectorCancel()
// Pass the shared pointer to collector
// Note: NVML calls in collector are protected by locks in nvidia.go where collector methods are called
component.nvmlMtx.Lock()
collectorPointer, err := collector.NewNvidiaCollector(ctx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name)
collectorPointer, err := collector.NewNvidiaCollector(collectorCtx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name)
component.nvmlMtx.Unlock()
if err != nil {
logrus.WithField("component", "nvidia").Errorf("NewNvidiaCollector failed: %v", err)
Expand All @@ -282,11 +303,16 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp
return component, nil
}

xidPoller, err := NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel)
if err != nil {
logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err)
component.initError = fmt.Errorf("failed to create XID event poller: %w", err)
return component, nil
var xidPoller *XidEventPoller
if nvidiaCfg.Nvidia.IsXidPollerEnabled() {
xidPoller, err = NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel)
if err != nil {
logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err)
component.initError = fmt.Errorf("failed to create XID event poller: %w", err)
return component, nil
}
} else {
logrus.WithField("component", "nvidia").Infof("XID event poller disabled")
}

freqController := common.GetFreqController()
Expand Down Expand Up @@ -517,8 +543,7 @@ func (c *component) Start() <-chan *common.Result {
}
// Check if the error message contains "Timeout"
if result != nil && len(result.Checkers) > 0 && strings.Contains(result.Checkers[0].Name, "HealthCheckTimeout") {
// Handle the timeout error
// ReNewNvml requires healthCheckMtx lock to be held
// HealthCheck timed out: try ReNewNvml
c.healthCheckMtx.Lock()
err := ReNewNvml(c)
c.healthCheckMtx.Unlock()
Expand Down
14 changes: 7 additions & 7 deletions components/nvidia/xid_poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,21 +83,21 @@ func (x *XidEventPoller) Start() error {

// waits for the duration specified in x.Cfg.UpdateInterval (in seconds)
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents
// e, err := x.XidEventSet.Wait(uint32(x.Cfg.UpdateInterval.Microseconds()))
event, ret := x.XidEventSet.Wait(200)
logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned: %v, %v", event.EventData, ret)
if ret == nvml.ERROR_NOT_SUPPORTED {
logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret))
event, ret := x.XidEventSet.Wait(uint32(1000))
if ret == nvml.ERROR_TIMEOUT {
// no event within timeout — normal
logrus.WithField("component", "nvidia").Debugf("XidEventSet.Wait timeout (no event)")
continue
}
if ret == nvml.ERROR_TIMEOUT {
// no event within timeout
if ret == nvml.ERROR_NOT_SUPPORTED {
logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret))
continue
}
if ret != nvml.SUCCESS {
logrus.WithField("component", "nvidia").Warningf("XidEventSet.Wait failure -- Retrying: %v", nvml.ErrorString(ret))
continue
}
logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned event: %v", event.EventData)

x.handleEvent(event)
}
Expand Down
24 changes: 21 additions & 3 deletions config/default_spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,27 @@ infiniband:
default:
<<: *ib_base
hca:
MT_0000000838: {}
DEL0000000036: {}
MT_0000000223: {}
MT_0000000838:
hardware:
hca_type: "MT4129"
board_id: "MT_0000000838"
fw_ver: ">=28.39.2048"
vpd: "NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP, PCIe 5.0 x16, Crypto Disabled, Secure Boot Enabled"
net_port: 1
port_speed: "400 Gb/sec (4X NDR)"
phy_state: "LinkUp"
port_state: "ACTIVE"
net_operstate: "down"
link_layer: "InfiniBand"
pcie_width: "16"
pcie_speed: "32.0 GT/s PCIe"
pcie_tree_width: "16"
pcie_tree_speed: "32"
pcie_acs: "disable"
pcie_mrr: "4096"
perf:
one_way_bw: 360 # Gbps
avg_latency_us: 10 # us
pcie_topo:
"0x233510de":
numa_config:
Expand Down
80 changes: 80 additions & 0 deletions docker/Dockerfile.cuda128
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS build

ARG GO_VERSION=1.23.3
ARG GORELEASER_VERSION=v2.13.1

# Disable the CUDA apt repos for this stage (nothing here needs them and their
# mirrors are slow / occasionally unreachable).  Move the list files into a
# directory: the previous form `mv cuda*.list /tmp/disabled-cuda.list` fails
# whenever the glob matches more than one file (multi-source mv requires a
# directory target) and the `|| true` silently swallowed that failure, leaving
# the repos enabled.
RUN mkdir -p /tmp/disabled-apt && \
    (mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-apt/ 2>/dev/null || true) && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential gcc g++ curl git wget ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Install the Go toolchain used by GoReleaser.
RUN curl -fsSL https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz | \
    tar -C /usr/local -xz && \
    ln -s /usr/local/go/bin/go /usr/local/bin/go

# Install GoReleaser (builds the sichek binaries and .deb packages).
RUN curl -fsSL https://github.com/goreleaser/goreleaser/releases/download/${GORELEASER_VERSION}/goreleaser_Linux_x86_64.tar.gz | \
    tar -xz -C /usr/local/bin goreleaser && \
    goreleaser --version

# Checksum-DB lookups would fail in an offline/vendored build.
ENV GOSUMDB=off
WORKDIR /go/src/sichek

COPY . .

# BUILD_TIME is optional; when provided it is exported so the release embeds a
# reproducible timestamp, otherwise GoReleaser falls back to its own default.
ARG BUILD_TIME
RUN if [ -n "$BUILD_TIME" ]; then \
    BUILD_TIME=$BUILD_TIME goreleaser release --snapshot --clean ; \
    else \
    goreleaser release --snapshot --clean ; \
    fi

FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04

ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai
# NOTE(review): cleared so the container runtime does not inject GPU devices by
# default — confirm this is the intended default for the runtime image.
ENV NVIDIA_VISIBLE_DEVICES=""

# Same glob-safe disabling of the CUDA apt repos as in the build stage.
RUN mkdir -p /tmp/disabled-apt && \
    (mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-apt/ 2>/dev/null || true) && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    vim curl tzdata ca-certificates \
    openssh-server openssh-client \
    iproute2 iputils-ping jq \
    libnuma1 numactl rdmacm-utils perftest xz-utils && \
    ln -fs /usr/share/zoneinfo/${TZ} /etc/localtime && \
    dpkg-reconfigure --frontend noninteractive tzdata && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Helm (runtime tool).  NOTE(review): the upstream script installs the latest
# Helm v3 — pin a version if reproducible images are required.
RUN curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

WORKDIR /opt/sichek
COPY --from=build /go/src/sichek/dist ./dist

# Download the SICL runtime bundle, run its installer, then install the sichek
# .deb that the build stage produced.
ARG SICL_VERSION=sicl-25.11-1.cuda128.ubuntu2004.run
RUN curl -fsSL -o ./dist/${SICL_VERSION} \
    https://oss-ap-southeast.scitix.ai/scitix-release/${SICL_VERSION} && \
    bash ./dist/${SICL_VERSION} && \
    dpkg -i ./dist/sichek_*_linux_amd64.deb

ENV SIHPC_HOME=/usr/local/sihpc
ENV PATH=${SIHPC_HOME}/bin:$PATH
ENV LD_LIBRARY_PATH=${SIHPC_HOME}/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV OMPI_MCA_opal_prefix=${SIHPC_HOME}
ENV OPAL_PREFIX=${SIHPC_HOME}

# Passwordless, host-key-checking-free SSH between MPI ranks.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
    mkdir -p /var/run/sshd /root/.ssh && \
    chmod 700 /root/.ssh
# Bake a keypair so containers started from this image can SSH to each other.
# NOTE(review): every consumer of this image shares the same private key —
# acceptable only on closed cluster networks.
RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -N "" && \
    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
    chmod 600 /root/.ssh/authorized_keys

EXPOSE 22
Loading
Loading