From 3408d48bf39c85dd13e57405e7ecb3ffeaa2287e Mon Sep 17 00:00:00 2001 From: xlliu Date: Thu, 12 Feb 2026 01:10:45 +0000 Subject: [PATCH 1/4] disable NVIDIA XidPoller by default --- .goreleaser.yaml | 1 + cmd/command/version.go | 7 +- components/nvidia/config/config.go | 5 + components/nvidia/nvidia.go | 50 +++-- components/nvidia/xid_poller.go | 14 +- install.yaml | 348 +++++++++++++++++++++++++++++ 6 files changed, 403 insertions(+), 22 deletions(-) create mode 100644 install.yaml diff --git a/.goreleaser.yaml b/.goreleaser.yaml index 97e2770a..2e5dd045 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -53,6 +53,7 @@ builds: - -mod=vendor ldflags: - "-s -w" + - "-X github.com/scitix/sichek/cmd/command.Version={{ .Tag }}" - "-X github.com/scitix/sichek/cmd/command.Major={{ .Major }}" - "-X github.com/scitix/sichek/cmd/command.Minor={{ .Minor }}" - "-X github.com/scitix/sichek/cmd/command.Patch={{ .Patch }}" diff --git a/cmd/command/version.go b/cmd/command/version.go index c15ac16b..74694948 100644 --- a/cmd/command/version.go +++ b/cmd/command/version.go @@ -26,6 +26,9 @@ import ( ) var ( + // Version is the full version string (e.g. v0.7.6.post1), set by ldflags at build time. + // When set, it is used for display instead of Major.Minor.Patch. + Version = "" Major = "" Minor = "" Patch = "" @@ -52,7 +55,9 @@ func NewVersionCmd() *cobra.Command { now := time.Now() BuildTime = now.Format("2006-01-02T15:04:05") } - if Major == "" { + if Version != "" { + version = Version + } else if Major == "" { version = "dev-" + GitCommit } else { version = "v" + Major + "." + Minor + "." 
+ Patch diff --git a/components/nvidia/config/config.go b/components/nvidia/config/config.go index fa4c92b4..a4ddaa07 100644 --- a/components/nvidia/config/config.go +++ b/components/nvidia/config/config.go @@ -27,9 +27,14 @@ type NvidiaConfig struct { QueryInterval common.Duration `json:"query_interval" yaml:"query_interval"` CacheSize int64 `json:"cache_size" yaml:"cache_size"` EnableMetrics bool `json:"enable_metrics" yaml:"enable_metrics"` + EnableXidPoller bool `json:"enable_xid_poller" yaml:"enable_xid_poller"` IgnoredCheckers []string `json:"ignored_checkers,omitempty" yaml:"ignored_checkers,omitempty"` } +func (c *NvidiaConfig) IsXidPollerEnabled() bool { + return c.EnableXidPoller +} + func (c *NvidiaUserConfig) GetCheckerSpec() map[string]common.CheckerSpec { commonCfgMap := make(map[string]common.CheckerSpec) return commonCfgMap diff --git a/components/nvidia/nvidia.go b/components/nvidia/nvidia.go index 881b9826..69efbac3 100644 --- a/components/nvidia/nvidia.go +++ b/components/nvidia/nvidia.go @@ -66,6 +66,10 @@ type component struct { metrics *metrics.NvidiaMetrics initError error // Track initialization errors with detailed information + + // ReNewNvml rate limit: when called from HealthCheck timeout path, skip if last ReNewNvml was < 60s ago + renewBackoffMtx sync.Mutex + lastReNewNvmlTime time.Time } var ( @@ -117,6 +121,17 @@ func NewNvml(ctx context.Context) (nvml.Interface, error) { // Note: The caller must hold c.healthCheckMtx lock before calling this function // to ensure thread-safe access to nvmlInst and xidPoller. 
func ReNewNvml(c *component) error { + const minReNewNvmlInterval = 60 * time.Second + // When nvmlInst exists (called from HealthCheck timeout path), rate-limit to avoid thrashing with DCGM + if c.nvmlInst != nil { + c.renewBackoffMtx.Lock() + last := c.lastReNewNvmlTime + c.renewBackoffMtx.Unlock() + if !last.IsZero() && time.Since(last) < minReNewNvmlInterval { + logrus.WithField("component", "nvidia").Debugf("ReNewNvml skipped (last run %v ago)", time.Since(last)) + return nil + } + } // Stop the XidEventPoller before shutting down NVML to prevent SIGSEGV if c.xidPoller != nil { @@ -144,31 +159,34 @@ func ReNewNvml(c *component) error { if c.nvmlInstPtr != nil { *c.nvmlInstPtr = nvmlInst } - // Recreate the XidEventPoller with the new NVML instance - // Use RLock to check running status (nested lock: healthCheckMtx -> serviceMtx is safe) + // Recreate the XidEventPoller with the new NVML instance only when enabled c.serviceMtx.RLock() isRunning := c.running c.serviceMtx.RUnlock() - if isRunning { - newXidPoller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel) + var newPoller *XidEventPoller + if isRunning && c.cfg != nil && c.cfg.Nvidia != nil && c.cfg.Nvidia.IsXidPollerEnabled() { + poller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel) if err != nil { logrus.WithField("component", "nvidia").Errorf("failed to recreate xid poller after NVML reinit: %v", err) } else { - c.xidPoller = newXidPoller - // Restart the poller in a goroutine if the component is still running + newPoller = poller go func() { defer func() { if err := recover(); err != nil { fmt.Printf("[xidPoller] panic err is %s\n", err) } }() - err := c.xidPoller.Start() + err := poller.Start() if err != nil { logrus.WithField("component", "nvidia").Errorf("start xid poller failed after reinit: %v", err) } }() } } + c.xidPoller = newPoller + c.renewBackoffMtx.Lock() + c.lastReNewNvmlTime = time.Now() + c.renewBackoffMtx.Unlock() } return ret 
} @@ -282,11 +300,16 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp return component, nil } - xidPoller, err := NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel) - if err != nil { - logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err) - component.initError = fmt.Errorf("failed to create XID event poller: %w", err) - return component, nil + var xidPoller *XidEventPoller + if nvidiaCfg.Nvidia.IsXidPollerEnabled() { + xidPoller, err = NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel) + if err != nil { + logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err) + component.initError = fmt.Errorf("failed to create XID event poller: %w", err) + return component, nil + } + } else { + logrus.WithField("component", "nvidia").Infof("XID event poller disabled") } freqController := common.GetFreqController() @@ -517,8 +540,7 @@ func (c *component) Start() <-chan *common.Result { } // Check if the error message contains "Timeout" if result != nil && len(result.Checkers) > 0 && strings.Contains(result.Checkers[0].Name, "HealthCheckTimeout") { - // Handle the timeout error - // ReNewNvml requires healthCheckMtx lock to be held + // HealthCheck timed out: try ReNewNvml c.healthCheckMtx.Lock() err := ReNewNvml(c) c.healthCheckMtx.Unlock() diff --git a/components/nvidia/xid_poller.go b/components/nvidia/xid_poller.go index aa565043..1c9e6e44 100644 --- a/components/nvidia/xid_poller.go +++ b/components/nvidia/xid_poller.go @@ -83,21 +83,21 @@ func (x *XidEventPoller) Start() error { // waits for the duration specified in x.Cfg.UpdateInterval (in seconds) // ref. 
https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents - // e, err := x.XidEventSet.Wait(uint32(x.Cfg.UpdateInterval.Microseconds())) - event, ret := x.XidEventSet.Wait(200) - logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned: %v, %v", event.EventData, ret) - if ret == nvml.ERROR_NOT_SUPPORTED { - logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret)) + event, ret := x.XidEventSet.Wait(uint32(1000)) + if ret == nvml.ERROR_TIMEOUT { + // no event within timeout — normal + logrus.WithField("component", "nvidia").Debugf("XidEventSet.Wait timeout (no event)") continue } - if ret == nvml.ERROR_TIMEOUT { - // no event within timeout + if ret == nvml.ERROR_NOT_SUPPORTED { + logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret)) continue } if ret != nvml.SUCCESS { logrus.WithField("component", "nvidia").Warningf("XidEventSet.Wait failure -- Retrying: %v", nvml.ErrorString(ret)) continue } + logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned event: %v", event.EventData) x.handleEvent(event) } diff --git a/install.yaml b/install.yaml new file mode 100644 index 00000000..853ce40b --- /dev/null +++ b/install.yaml @@ -0,0 +1,348 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: sichek-gpu + namespace: monitoring + labels: + app: sichek +spec: + selector: + matchLabels: + app: sichek + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 10 + + template: + metadata: + labels: + app: sichek + spec: + hostPID: true + hostNetwork: true + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: scitix.ai/gpu-type + operator: Exists + tolerations: + - operator: Exists + + serviceAccountName: sa-sichek + + ############################################################ + # volumes + 
############################################################ + volumes: + - name: sichek-host + hostPath: + path: /var/sichek + type: DirectoryOrCreate + - name: sichek-default-spec + configMap: + name: sichek-default-spec + - name: sichek-default-user-config + configMap: + name: sichek-default-user-config + + ########################################################### + # main container + ########################################################### + containers: + - name: sichek + image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + imagePullPolicy: Always + + securityContext: + privileged: true + + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: SICHEK_SPEC_URL + value: {{ .sichek_spec_url }} + + volumeMounts: + - name: sichek-host + mountPath: /host/var/sichek + + lifecycle: + preStop: + exec: + command: + - /bin/bash + - -c + - | + nsenter -t 1 -m -p -n -u -i -- \ + rm -rf /var/sichek/run/pods/${POD_UID} + + command: ["/bin/bash", "-c"] + args: + - | + set -euo pipefail + + HOST="nsenter -t 1 -m -p -n -u -i --" + echo "[sichek] ensure service started" + $HOST sichek d start + + echo "[sichek] entering keepalive loop" + while true; do + if ! 
$HOST systemctl is-active --quiet sichek; then + echo "[sichek] service not active, restarting" + $HOST sichek d start + fi + sleep 10 + done + + resources: + requests: + cpu: 100m + memory: 1Mi + limits: + cpu: "1" + memory: 1Gi + + - name: sichek-exporter + image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + securityContext: + privileged: true + volumeMounts: + - name: sichek-host + mountPath: /host/var/sichek + ports: + - name: metrics + containerPort: {{ .metrics_port }} + command: ["/bin/bash", "-c"] + args: + - | + set -euo pipefail + + RUNTIME=/host/var/sichek/run/current + sichek exporter \ + --metrics-socket ${RUNTIME}/metrics.sock \ + --listen :{{ .metrics_port }} + + resources: + requests: + cpu: 100m + memory: 1Mi + limits: + cpu: "1" + memory: 1Gi + + ############################################################ + # init container + ############################################################ + initContainers: + - name: sichek-init + image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + imagePullPolicy: Always + + securityContext: + privileged: true + runAsUser: 0 + capabilities: + add: [SYS_ADMIN] + + env: + - name: SICHEK_VERSION + value: {{ .version }} + - name: SICHEK_SPEC_URL + value: {{ .sichek_spec_url }} + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + + volumeMounts: + - name: sichek-host + mountPath: /host/var/sichek + - name: sichek-default-spec + mountPath: /var/sichek/config/default_spec.yaml + subPath: default_spec.yaml + - name: sichek-default-user-config + mountPath: /var/sichek/config/default_user_config.yaml + subPath: default_user_config.yaml + + command: ["/bin/bash", "-c"] + args: + - | + set -euo pipefail + set -x + + HOST="nsenter -t 1 -m -p -n -u -i --" + SICL=$(ls /opt/sichek/dist/sicl-*.run) + SICHEK_RPM=$(ls /opt/sichek/dist/sichek_*_linux_amd64.rpm) + SICHEK_DEB=$(ls /opt/sichek/dist/sichek_*_linux_amd64.deb) + 
RUNTIME_BASE=/host/var/sichek/run + HOST_RUNTIME_BASE=/var/sichek/run + HOST_SICL_INSTALL_PATH=/usr/local/sihpc + [[ -e "${RUNTIME_BASE}" ]] && rm -rf "${RUNTIME_BASE}" + mkdir -p "${RUNTIME_BASE}" + + ########################################################## + # 1. install / verify sichek on host + ########################################################## + # 1.1 check existing sichek + need_install=true + if $HOST sh -c 'command -v sichek >/dev/null 2>&1'; then + if $HOST sichek version >/dev/null 2>&1; then + current_version=$($HOST sichek version 2>&1 | awk '/^Version:/ {print $2}') + else + echo "[sichek] sichek exists but cannot run (glibc mismatch?)" + current_version="" + fi + echo "[sichek] found sichek version=${current_version}" + if [[ "${current_version}" == "${SICHEK_VERSION}" ]]; then + echo "[sichek] version match(${current_version} == ${SICHEK_VERSION}), skip install" + need_install=false + else + echo "[sichek] version mismatch(${current_version} != ${SICHEK_VERSION}), reinstall" + fi + else + echo "[sichek] sichek not installed" + fi + + # 1.2 install if needed + echo "[sichek] node=${NODE_NAME}" + if [[ "${need_install}" == "true" ]]; then + $HOST sichek daemon stop || echo "Failed to stop sichek daemon" + + echo "[sichek] install sicl" + $HOST rm -rf "${HOST_SICL_INSTALL_PATH}" + cp "$SICL" "${RUNTIME_BASE}/" + $HOST bash "${HOST_RUNTIME_BASE}/$(basename "$SICL")" + rm "${RUNTIME_BASE}/$(basename "$SICL")" + + echo "[sichek] install sichek" + if $HOST sh -c 'command -v rpm >/dev/null 2>&1'; then + echo "[sichek] install via rpm" + cp "$SICHEK_RPM" "${RUNTIME_BASE}/" + $HOST rpm -e sichek || echo "Failed to remove existing sichek" + $HOST rpm -Uvh --force "${HOST_RUNTIME_BASE}/$(basename "$SICHEK_RPM")" + rm "${RUNTIME_BASE}/$(basename "$SICHEK_RPM")" + elif $HOST sh -c 'command -v dpkg >/dev/null 2>&1'; then + echo "[sichek] install via dpkg" + cp "$SICHEK_DEB" "${RUNTIME_BASE}/" + $HOST dpkg -P sichek || echo "Failed to remove existing 
sichek" + $HOST dpkg -i "${HOST_RUNTIME_BASE}/$(basename "$SICHEK_DEB")" + rm "${RUNTIME_BASE}/$(basename "$SICHEK_DEB")" + else + echo "[sichek] no rpm / dpkg found on host" + exit 1 + fi + cp /var/sichek/config/default_spec.yaml /host/var/sichek/config/default_spec.yaml + cp /var/sichek/config/default_user_config.yaml /host/var/sichek/config/default_user_config.yaml + fi + + # 1.3 final version check + echo "[sichek] final version check" + if $HOST sichek version >/dev/null 2>&1; then + final_version=$($HOST sichek version 2>&1 | awk '/^Version:/ {print $2}') + else + echo "[sichek] sichek exists but cannot run (glibc mismatch?)" + final_version="" + fi + if [[ "${final_version}" != "${SICHEK_VERSION}" ]]; then + echo "Installed sichek version (${final_version}) does not match expected (${SICHEK_VERSION})" + exit 1 + fi + echo "[sichek] sichek ${final_version} ready" + + cp /var/sichek/config/default_spec.yaml /host/var/sichek/config/default_spec.yaml + cp /var/sichek/config/default_user_config.yaml /host/var/sichek/config/default_user_config.yaml + + ########################################################## + # 2. 
generate kubeconfig (atomic) + ########################################################## + POD_DIR=${RUNTIME_BASE}/pods/${POD_UID} + CURRENT_LINK=${RUNTIME_BASE}/current + mkdir -p "${POD_DIR}" + CA_FILE=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + TOKEN_FILE=/var/run/secrets/kubernetes.io/serviceaccount/token + SERVER="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT}" + + CA_DATA=$(base64 < "${CA_FILE}" | tr -d '\n') + TOKEN=$(cat "${TOKEN_FILE}") + + tmp=$(mktemp "${POD_DIR}/kubeconfig.XXXX") + cat <<EOF > "${tmp}" + apiVersion: v1 + kind: Config + clusters: + - name: local + cluster: + server: ${SERVER} + certificate-authority-data: ${CA_DATA} + users: + - name: sichek + user: + token: ${TOKEN} + contexts: + - name: sichek-context + context: + cluster: local + user: sichek + current-context: sichek-context + EOF + + chmod 600 "${tmp}" + mv "${tmp}" "${POD_DIR}/kubeconfig" + + # Use relative symlink so both host and container resolve current under their view of the mount. + # (Absolute /var/sichek/... would not resolve inside container which only has /host/var/sichek.) 
+ ln -sfn "pods/${POD_UID}" "${CURRENT_LINK}" + + HOST_CURRENT=${HOST_RUNTIME_BASE}/current + TMP_ENV=$(mktemp "${POD_DIR}/env.XXXX") + cat <<EOF > "${TMP_ENV}" + FLAGS="--metrics-socket ${HOST_CURRENT}/metrics.sock -s /var/sichek/config/default_spec.yaml -c /var/sichek/config/default_user_config.yaml --log-file ${HOST_CURRENT}/sichek.log" + KUBECONFIG=${HOST_CURRENT}/kubeconfig + SICHEK_SPEC_URL={{ .sichek_spec_url }} + EOF + chmod 644 "${TMP_ENV}" + mv "${TMP_ENV}" "${RUNTIME_BASE}/env" + + dnsPolicy: ClusterFirstWithHostNet + restartPolicy: Always + terminationGracePeriodSeconds: 30 +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sa-sichek + namespace: monitoring +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-role-sichek +rules: + - apiGroups: ["kubeflow.org", ""] + resources: ["nodes", "pods", "pytorchjobs"] + verbs: ["get", "list", "patch", "update", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-role-binding-sichek + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io # apiGroup must not include the version suffix (no /v1) + kind: ClusterRole + name: cluster-role-sichek +subjects: + - kind: ServiceAccount + name: sa-sichek + namespace: monitoring \ No newline at end of file From 5dbceafe97a8d03e7558ad97bf087ef9a0de1ad1 Mon Sep 17 00:00:00 2001 From: xlliu Date: Mon, 23 Feb 2026 11:42:38 +0000 Subject: [PATCH 2/4] Fix ib_dev checker and enhance atest features - Fix abnormal reporting in ib_dev checker when devices not defined in spec are detected - Add --swanlab-mode option to atest - Add k8s wrapper commands for atest --- .github/workflows/release.yml | 11 ++++ components/infiniband/checker/ib_devs.go | 11 ++++ config/default_spec.yaml | 24 ++++++- docker/Dockerfile.cuda128 | 80 ++++++++++++++++++++++++ scripts/atest/common.py | 13 ++++ scripts/atest/config.py | 5 ++ scripts/atest/deepeptest_multi_node.py | 17 +++-
scripts/atest/deepeptest_single_node.py | 10 ++- scripts/atest/modeltest_multi_node.py | 26 +++++--- scripts/atest/modeltest_single_node.py | 6 +- scripts/atest/mpijob_helper.py | 8 ++- scripts/atest/nccltest_multi_node.py | 30 ++++++--- scripts/atest/nccltest_single_node.py | 10 +-- scripts/postinstall.sh | 3 + scripts/sichek-ib-topo | 29 ++++++--- 15 files changed, 240 insertions(+), 43 deletions(-) create mode 100644 docker/Dockerfile.cuda128 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 822f8210..c8114abb 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -67,3 +67,14 @@ jobs: ghcr.io/${{ github.repository_owner }}/sichek:latest labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} + - name: Build and push Docker image (ubuntu20.04, cuda12.8) + uses: docker/build-push-action@v4 + with: + context: ./ + file: docker/Dockerfile.cuda128 + platforms: linux/amd64 + push: true + tags: | + ghcr.io/${{ github.repository_owner }}/sichek:${{ github.ref_name }}-ubuntu2004-cuda12.8 + labels: | + org.opencontainers.image.source=https://github.com/${{ github.repository }} diff --git a/components/infiniband/checker/ib_devs.go b/components/infiniband/checker/ib_devs.go index b1fc840b..cf63d751 100644 --- a/components/infiniband/checker/ib_devs.go +++ b/components/infiniband/checker/ib_devs.go @@ -62,6 +62,7 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes var mismatchPairs []string infinibandInfo.RLock() + // 1) Spec -> actual: missing or wrong mapping for expectedMlx5, expectedIb := range c.spec.IBPFDevs { // skip mezzanine card in check if strings.Contains(expectedMlx5, "mezz") { @@ -80,6 +81,16 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s -> %s (expected %s)", expectedMlx5, actualIb, expectedIb)) } } + // 2) Actual -> spec: extra devices not defined in 
spec (e.g. mlx5_7 in spec but actual shows mlx5_13_6209) + for actualMlx5 := range infinibandInfo.IBPFDevs { + if strings.Contains(actualMlx5, "mezz") { + continue + } + if _, inSpec := c.spec.IBPFDevs[actualMlx5]; !inSpec { + mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s (not in spec)", actualMlx5)) + logrus.WithField("component", "infiniband").Debugf("mismatch: actual device %s not defined in spec", actualMlx5) + } + } infinibandInfo.RUnlock() if len(mismatchPairs) > 0 { diff --git a/config/default_spec.yaml b/config/default_spec.yaml index 2f3c8f42..da63ee78 100644 --- a/config/default_spec.yaml +++ b/config/default_spec.yaml @@ -66,9 +66,27 @@ infiniband: default: <<: *ib_base hca: - MT_0000000838: {} - DEL0000000036: {} - MT_0000000223: {} + MT_0000000838: + hardware: + hca_type: "MT4129" + board_id: "MT_0000000838" + fw_ver: ">=28.39.2048" + vpd: "NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP, PCIe 5.0 x16, Crypto Disabled, Secure Boot Enabled" + net_port: 1 + port_speed: "400 Gb/sec (4X NDR)" + phy_state: "LinkUp" + port_state: "ACTIVE" + net_operstate: "down" + link_layer: "InfiniBand" + pcie_width: "16" + pcie_speed: "32.0 GT/s PCIe" + pcie_tree_width: "16" + pcie_tree_speed: "32" + pcie_acs: "disable" + pcie_mrr: "4096" + perf: + one_way_bw: 360 # Gbps + avg_latency_us: 10 # us pcie_topo: "0x233510de": numa_config: diff --git a/docker/Dockerfile.cuda128 b/docker/Dockerfile.cuda128 new file mode 100644 index 00000000..5a12d931 --- /dev/null +++ b/docker/Dockerfile.cuda128 @@ -0,0 +1,80 @@ +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS build + +ARG GO_VERSION=1.23.3 +ARG GORELEASER_VERSION=v2.13.1 + +RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list 2>/dev/null || true && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential gcc g++ curl git wget ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# Install Go +RUN curl -fsSL 
https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz | \ + tar -C /usr/local -xz && \ + ln -s /usr/local/go/bin/go /usr/local/bin/go + +# Install GoReleaser +RUN curl -fsSL https://github.com/goreleaser/goreleaser/releases/download/${GORELEASER_VERSION}/goreleaser_Linux_x86_64.tar.gz | \ + tar -xz -C /usr/local/bin goreleaser && \ + goreleaser --version + +ENV GOSUMDB=off +WORKDIR /go/src/sichek + +COPY . . + +ARG BUILD_TIME +RUN if [ -n "$BUILD_TIME" ]; then \ + BUILD_TIME=$BUILD_TIME goreleaser release --snapshot --clean ; \ + else \ + goreleaser release --snapshot --clean ; \ + fi + +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai +ENV NVIDIA_VISIBLE_DEVICES="" + +RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list 2>/dev/null || true && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + vim curl tzdata ca-certificates \ + openssh-server openssh-client \ + iproute2 iputils-ping jq \ + libnuma1 numactl rdmacm-utils perftest xz-utils && \ + ln -fs /usr/share/zoneinfo/${TZ} /etc/localtime && \ + dpkg-reconfigure --frontend noninteractive tzdata && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Helm (runtime tool) +RUN curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + +WORKDIR /opt/sichek +COPY --from=build /go/src/sichek/dist ./dist + +ARG SICL_VERSION=sicl-25.11-1.cuda128.ubuntu2004.run +RUN curl -fsSL -o ./dist/${SICL_VERSION} \ + https://oss-ap-southeast.scitix.ai/scitix-release/${SICL_VERSION} && \ + bash ./dist/${SICL_VERSION} && \ + dpkg -i ./dist/sichek_*_linux_amd64.deb + +ENV SIHPC_HOME=/usr/local/sihpc +ENV PATH=${SIHPC_HOME}/bin:$PATH +ENV LD_LIBRARY_PATH=${SIHPC_HOME}/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +ENV OMPI_MCA_opal_prefix=${SIHPC_HOME} +ENV OPAL_PREFIX=${SIHPC_HOME} + +# SSH dependencies for MPI +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " 
UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ + mkdir -p /var/run/sshd /root/.ssh && \ + chmod 700 /root/.ssh +RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -N "" && \ + cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \ + chmod 600 /root/.ssh/authorized_keys + +EXPOSE 22 diff --git a/scripts/atest/common.py b/scripts/atest/common.py index 21ae0fe2..99d9aa69 100644 --- a/scripts/atest/common.py +++ b/scripts/atest/common.py @@ -159,6 +159,7 @@ def is_valid_value(value): ("SWANLAB_API_KEY", "swanlab_api_key"), ("SWANLAB_WORKSPACE", "swanlab_workspace"), ("SWANLAB_PROJ_NAME", "swanlab_proj_name"), + ("SWANLAB_MODE", "swanlab_mode"), ]: val = config.get(config_key) if is_valid_value(val): @@ -166,6 +167,18 @@ def is_valid_value(value): elif config_key in config: os.environ.pop(env_key, None) +def apply_swanlab_mode(cli_mode: Optional[str], config: Dict[str, str]): + """Resolve swanlab mode from CLI > config > env, and set SWANLAB_MODE env var.""" + mode = pick_value(cli_mode, config, "swanlab_mode", "") + if mode: + os.environ["SWANLAB_MODE"] = mode + + +def is_swanlab_disabled() -> bool: + """Return True if SWANLAB_MODE is set to 'disabled'.""" + return os.getenv("SWANLAB_MODE", "").lower() == "disabled" + + def pick_value(cli_value: Optional[str], config: Dict[str, str], key: str, default: str) -> str: if cli_value not in (None, ""): return cli_value diff --git a/scripts/atest/config.py b/scripts/atest/config.py index ab8dd0a2..85226e0b 100644 --- a/scripts/atest/config.py +++ b/scripts/atest/config.py @@ -29,6 +29,7 @@ "swanlab_api_key", "swanlab_workspace", "swanlab_proj_name", + "swanlab_mode", ] CONFIG_DIR = Path.home() / ".sichek" @@ -162,6 +163,10 @@ def config_create(): "swanlab project", config.get("swanlab_proj_name", "") or config.get("swanlab_proj_name", "") ) + new_config["swanlab_mode"] = ask( + "swanlab mode [cloud, offline, disabled, local]", + 
config.get("swanlab_mode", "cloud") + ) # Validate default_spec if provided if new_config.get("default_spec"): diff --git a/scripts/atest/deepeptest_multi_node.py b/scripts/atest/deepeptest_multi_node.py index 05693be3..3d97dadc 100644 --- a/scripts/atest/deepeptest_multi_node.py +++ b/scripts/atest/deepeptest_multi_node.py @@ -33,6 +33,8 @@ parse_hostnames, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, start_kubectl_log_stream, wait_for_pods_ready, ) @@ -152,23 +154,26 @@ def main() -> None: parser.add_argument("--job-name", default=None) parser.add_argument("--namespace", default="default") parser.add_argument("--cmd", default="", help="Command to run (same as deepeptest_single_node)") - parser.add_argument("--image-repo", default="registry-taihua.siflow.cn/hisys/mcore", help="Container image repository") - parser.add_argument("--image-tag", default="pytorch25.11-cuda13-cudnn9.17-te-main-v1", help="Container image tag") + parser.add_argument("--image-repo", default=None, help="Container image repository (or set pytorchjob_image_repo in config)") + parser.add_argument("--image-tag", default=None, help="Container image tag (or set pytorchjob_image_tag in config)") parser.add_argument("--timeout", type=int, default=600) parser.add_argument("--scheduler-name", default=None) parser.add_argument("--roce-shared-mode", default=None) parser.add_argument("--hostfile", default="None") parser.add_argument("--host", default="None") parser.add_argument("--host-dir", default=None, help="Host directory to mount (e.g. 
/tmp/DeepEP)") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value( - args.image_repo, config, "pytorchjob_image_repo", "registry-taihua.siflow.cn/hisys/mcore" + args.image_repo, config, "pytorchjob_image_repo", "registry-us-east.scitix.ai/hisys/mcore" ) args.image_tag = pick_value( - args.image_tag, config, "pytorchjob_image_tag", "pytorch25.11-cuda13-cudnn9.17-te-main-v1" + args.image_tag, config, "pytorchjob_image_tag", "v2.1-cudnn9.14-te2.8-cuda_arch_10.0_at" ) args.scheduler_name = pick_value(args.scheduler_name, config, "scheduler", "si-scheduler") args.roce_shared_mode = pick_value(args.roce_shared_mode, config, "roce_shared_mode", "none") @@ -191,7 +196,7 @@ def main() -> None: num_experts = 8 * num_workers default_cmd = ( "python /tmp/DeepEP/tests/test_internode.py " - f"--num-processes 8 --num-tokens 4096 --hidden 7168 --num-topk 8 --num-experts {num_experts}" + f"--num-processes 8 --num-tokens 4096 --hidden 7168 --num-topk 8 --num-experts 256" ) cmd = args.cmd if args.cmd else default_cmd if args.host_dir is not None: @@ -222,7 +227,7 @@ def signal_handler(sig: int, frame: Any) -> None: signal.signal(signal.SIGTERM, signal_handler) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=args.job_name, description=f"DeepEP tuning multi-node ({num_workers} workers)", diff --git a/scripts/atest/deepeptest_single_node.py b/scripts/atest/deepeptest_single_node.py index aaa24d31..2cea3deb 100644 --- a/scripts/atest/deepeptest_single_node.py +++ b/scripts/atest/deepeptest_single_node.py @@ -22,6 +22,8 @@ parse_hostnames, load_user_config, pick_value, + 
apply_swanlab_mode, + is_swanlab_disabled, ) from mpijob_helper import ( @@ -121,8 +123,8 @@ def main() -> None: parser.description = ( "Runs DeepEP intranode test on each worker pod and prints the three Best results per node." ) - parser.add_argument("--image-repo", default="registry-taihua.siflow.cn/hisys/mcore", help="Container image repository") - parser.add_argument("--image-tag", default="pytorch25.11-cuda13-cudnn9.17-te-main-v1", help="Container image tag") + parser.add_argument("--image-repo", default=None, help="Container image repository (or set pytorchjob_image_repo in config)") + parser.add_argument("--image-tag", default=None, help="Container image tag (or set pytorchjob_image_tag in config)") parser.add_argument( "--host-dir", default=None, @@ -131,6 +133,7 @@ def main() -> None: args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) default_cmd = ( "python /tmp/DeepEP/tests/test_intranode.py " @@ -169,12 +172,13 @@ def main() -> None: timeout=args.timeout, max_parallel_jobs=args.max_parallel_jobs, cmd=cmd, + request_gpu=not args.no_request_gpu, ) runner = MPIJobRunner(mpijob_config) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=mpijob_config.job_name, description=f"DeepEP tuning ({len(mpijob_config.hostnames)} workers)", diff --git a/scripts/atest/modeltest_multi_node.py b/scripts/atest/modeltest_multi_node.py index b9869a3d..7926cd85 100755 --- a/scripts/atest/modeltest_multi_node.py +++ b/scripts/atest/modeltest_multi_node.py @@ -5,6 +5,7 @@ import os import argparse +import re import shlex import signal import sys @@ -35,6 +36,8 @@ summarize, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, start_kubectl_log_stream, wait_for_pods_ready, ) @@ -47,7 +50,7 @@ def main(): parser.add_argument("--cmd", default="", 
help="command to run in each pod: PP=1 MBS=4 bash /workspace/ai4s-job-system/mcore_trainer/demos/llama/train_llama2_70b_bf16.sh by default") parser.add_argument("--image-repo", default=None) parser.add_argument("--image-tag", default=None) - parser.add_argument("--timeout", type=int, default=600) + parser.add_argument("--timeout", type=int, default=3600) parser.add_argument("--scheduler-name", default=None) parser.add_argument("--roce-shared-mode", default=None) parser.add_argument("--hostfile", default="None") @@ -59,10 +62,13 @@ def main(): parser.add_argument("--ep", type=int, default=None, help="Expert Parallelism size") parser.add_argument("--host-dir", default=None, help="host directory to mount in pytorchjob pods") parser.add_argument("--gpu-type", default=None, help="GPU type") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value(args.image_repo, config, "pytorchjob_image_repo", "registry-us-east.scitix.ai/hisys/mcore") @@ -110,7 +116,9 @@ def main(): cmd = f"EP={args.ep} {cmd}" if args.host_dir is not None: cmd = f"OLMO_CORE_DIR={args.host_dir} {cmd}" - if os.getenv("SWANLAB_API_KEY") is not None: + if is_swanlab_disabled(): + cmd = f"export SWANLAB_MODE=disabled && {cmd}" + elif os.getenv("SWANLAB_API_KEY") is not None: cmd = ( f"export SWANLAB_API_KEY={os.getenv('SWANLAB_API_KEY')} && " f"export SWANLAB_WORKSPACE={os.getenv('SWANLAB_WORKSPACE')} && " @@ -119,10 +127,7 @@ def main(): f"{cmd}" ) else: - cmd = ( - f"export SWANLAB_MODE=disabled && " - f"{cmd}" - ) + cmd = f"export SWANLAB_MODE=disabled && {cmd}" scripts_dir = Path(__file__).parent.resolve() helm_dir = scripts_dir.parent.parent / "k8s" / "sichek" @@ -149,7 +154,7 @@ def signal_handler(sig, frame): signal.signal(signal.SIGTERM, 
signal_handler) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=args.job_name, description=f"Llama2 70B benchmark ({num_workers} workers)", @@ -253,7 +258,12 @@ def signal_handler(sig, frame): if rc != 0 or not out.strip(): echo_warn(f"No worker pods found for job '{args.job_name}'") return - pods = sorted(out.strip().split()) + # Sort by worker index (e.g. worker-9 < worker-10 < worker-99 < worker-100) + def _worker_index(name): + m = re.search(r"worker-(\d+)$", name) + return int(m.group(1)) if m else -1 + + pods = sorted(out.strip().split(), key=_worker_index) last_pod = pods[-1] echo_info(f"Last worker pod name: {last_pod}") diff --git a/scripts/atest/modeltest_single_node.py b/scripts/atest/modeltest_single_node.py index 37ee3323..ec9d69f2 100644 --- a/scripts/atest/modeltest_single_node.py +++ b/scripts/atest/modeltest_single_node.py @@ -31,6 +31,8 @@ summarize, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, ) from mpijob_helper import ( @@ -120,6 +122,7 @@ def main(): args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) default_cmd = ( "bash /workspace/ai4s-job-system/mcore_trainer/demos/llama/train_llama2_13b_bf16.sh" @@ -156,12 +159,13 @@ def main(): timeout=args.timeout, max_parallel_jobs=args.max_parallel_jobs, cmd=cmd, + request_gpu=not args.no_request_gpu, ) runner = MPIJobRunner(mpijob_config) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=mpijob_config.job_name, description=f"Model benchmark ({len(mpijob_config.hostnames)} workers)", diff --git a/scripts/atest/mpijob_helper.py b/scripts/atest/mpijob_helper.py index ff3fd3e5..f8046f2e 100644 --- 
a/scripts/atest/mpijob_helper.py +++ b/scripts/atest/mpijob_helper.py @@ -128,7 +128,7 @@ def deploy(self): f"--set mpijob.name={shlex.quote(cfg.job_name)} " f"--set mpijob.numWorkers={len(cfg.hostnames)} " f"--set 'mpijob.nodeAffinityHosts={{{host_csv}}}' " - f"--set 'mpijob.requestGpu={cfg.request_gpu}'" + f"--set 'mpijob.requestGpu={'true' if cfg.request_gpu else 'false'}'" ) run_cmd_check(helm_cmd) @@ -322,13 +322,15 @@ def create_mpijob_arg_parser(description: str): parser.add_argument("--namespace", default="default", help="Kubernetes namespace") parser.add_argument("--cmd", default="", help="Command to run in each pod") # Note: --image-repo and --image-tag should be added by each script as they differ - parser.add_argument("--timeout", type=int, default=600, help="Timeout in seconds") + parser.add_argument("--timeout", type=int, default=3600, help="Timeout in seconds") parser.add_argument("--scheduler-name", default=None, help="Kubernetes scheduler name") parser.add_argument("--roce-shared-mode", default=None, help="RoCE shared mode") parser.add_argument("--hostfile", default="None", help="File containing hostnames") parser.add_argument("--host", default="None", help="Comma-separated hostnames") parser.add_argument("--max-parallel-jobs", type=int, default=200, help="Max parallel jobs") - + parser.add_argument("--no-request-gpu", action="store_true", help="Do not request GPU resources (default: request GPU)") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") return parser diff --git a/scripts/atest/nccltest_multi_node.py b/scripts/atest/nccltest_multi_node.py index 5d6f6d8e..c8112cce 100755 --- a/scripts/atest/nccltest_multi_node.py +++ b/scripts/atest/nccltest_multi_node.py @@ -37,6 +37,8 @@ parse_nccltest_bandwidth, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, start_kubectl_log_stream,
wait_for_pods_ready, ) @@ -56,10 +58,13 @@ def main(): parser.add_argument("--roce-shared-mode", default=None) parser.add_argument("--hostfile", default="None") parser.add_argument("--host", default="None") - parser.add_argument("--request-gpu", action="store_true", help="Request GPU resources for each worker pod") + parser.add_argument("--no-request-gpu", action="store_true", help="Do not request GPU resources (default: request GPU)") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value(args.image_repo, config, "image_repo", "registry-us-east.scitix.ai/hisys/sichek") args.image_tag = pick_value(args.image_tag, config, "image_tag", "latest") args.scheduler_name = pick_value(args.scheduler_name, config, "scheduler", "si-scheduler") @@ -101,7 +106,7 @@ def signal_handler(sig, frame): signal.signal(signal.SIGTERM, signal_handler) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=args.job_name, description=f"NCCL benchmark multi-node ({num_workers} workers)", @@ -125,7 +130,7 @@ def signal_handler(sig, frame): echo_info(f"Timeout: {args.timeout} seconds") host_csv = ",".join(hostnames) - gpu_flag = "true" if args.request_gpu else "false" + gpu_flag = "false" if args.no_request_gpu else "true" helm_cmd = ( f"helm upgrade --install {shlex.quote(args.job_name)} {shlex.quote(str(helm_dir))} " f"--atomic " @@ -220,14 +225,23 @@ def signal_handler(sig, frame): time.sleep(5) waited += 5 - run_cmd_line = ( - "timeout {timeout} /usr/local/sihpc/bin/mpirun " + # For all+reduce tests on >64 nodes, use -b32g -e32g + nccl_extra = "" + if "all" in cmd and "reduce" in cmd and 
num_workers > 64: + nccl_extra = " -b32g -e32g" + mpirun_part = ( + "/usr/local/sihpc/bin/mpirun " "--allow-run-as-root --map-by ppr:8:node " "--mca oob_tcp_if_include eth0 --mca pml ^ucx --mca btl self,tcp " "--mca btl_tcp_if_include eth0 --mca routed direct --mca plm_rsh_no_tree_spawn 1 " - "-x UCX_TLS=tcp -x NCCL_MIN_NCHANNELS=32 -x NCCL_IB_QPS_PER_CONNECTION=8 " - "/usr/local/sihpc/libexec/nccl-tests/nccl_test -l{cmd}" - ).format(timeout=args.timeout, cmd=cmd) + "-x UCX_TLS=tcp -x NCCL_DEBUG=WARN -x NCCL_MIN_NCHANNELS=32 -x NCCL_IB_QPS_PER_CONNECTION=8 " + "/usr/local/sihpc/libexec/nccl-tests/nccl_test -l{cmd}{extra}" + ).format(cmd=cmd, extra=nccl_extra) + + if "-N 0" in cmd: + run_cmd_line = mpirun_part + else: + run_cmd_line = f"timeout {args.timeout} {mpirun_part}" echo_info(f"Running NCCL test: {label}") # Wrap command to tee output to container's main process stdout diff --git a/scripts/atest/nccltest_single_node.py b/scripts/atest/nccltest_single_node.py index f3b679ff..603adc1e 100755 --- a/scripts/atest/nccltest_single_node.py +++ b/scripts/atest/nccltest_single_node.py @@ -29,6 +29,8 @@ parse_hostnames, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, ) from mpijob_helper import ( @@ -117,16 +119,16 @@ def main(): ) parser.add_argument("--image-repo", default=None, help="Container image repository") parser.add_argument("--image-tag", default=None, help="Container image tag") - parser.add_argument("--request-gpu", action="store_true", help="Request GPU resources for each worker pod") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) image_repo = pick_value(args.image_repo, config, "image_repo", "registry-us-east.scitix.ai/hisys/sichek") image_tag = pick_value(args.image_tag, config, "image_tag", "latest") if not args.cmd: - cmd = "NCCL_DEBUG=INFO /usr/local/sihpc/libexec/nccl-tests/nccl_test -g 8" + cmd = "NCCL_DEBUG=WARN /usr/local/sihpc/libexec/nccl-tests/nccl_test -g 8" 
else: cmd = args.cmd @@ -151,13 +153,13 @@ def main(): timeout=args.timeout, max_parallel_jobs=args.max_parallel_jobs, cmd=cmd, - request_gpu=args.request_gpu, + request_gpu=not args.no_request_gpu, ) runner = MPIJobRunner(mpijob_config) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=mpijob_config.job_name, description=f"NCCL benchmark ({len(mpijob_config.hostnames)} workers)", diff --git a/scripts/postinstall.sh b/scripts/postinstall.sh index b5128aeb..b2984a93 100755 --- a/scripts/postinstall.sh +++ b/scripts/postinstall.sh @@ -99,6 +99,9 @@ create_at_wrapper "sichek-k8s-llama2-70b" "${SICHEK_SCRIPTS_PATH}/atest/modeltes create_at_wrapper "sichek-k8s-olmo3-7b" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_single_node.py" --job-name sichek-olmo3-7b --cmd "bash /workspace/ai4s-job-system/mcore_trainer/demos/OLMo3/OLMo-3-1025-7B-pretrain-1.sh" create_at_wrapper "sichek-k8s-olmo3-7b-multinode" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_multi_node.py" --job-name sichek-olmo3-7b --cmd "bash /workspace/ai4s-job-system/mcore_trainer/demos/OLMo3/OLMo-3-1025-7B-pretrain-1.sh" create_at_wrapper "sichek-k8s-qwen-a3b" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_multi_node.py" --job-name sichek-qwen-a3b --cmd "MAX_STEPS=128 NCCL_DEBUG=WARN bash /workspace/ai4s-job-system/mcore_trainer/demos/deepseek/sft_deepseekv3.1_base.sh" +create_at_wrapper "sichek-k8s-modeltest-multinode" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_multi_node.py" +create_at_wrapper "sichek-k8s-deepeptest-singlenode" "${SICHEK_SCRIPTS_PATH}/atest/deepeptest_single_node.py" +create_at_wrapper "sichek-k8s-deepeptest-multinode" "${SICHEK_SCRIPTS_PATH}/atest/deepeptest_multi_node.py" # Run check_sicl.sh bash /var/sichek/scripts/check_sicl.sh || echo "Failed to run SICL installer" diff --git a/scripts/sichek-ib-topo b/scripts/sichek-ib-topo index eda587c1..d2fba835 100755 
--- a/scripts/sichek-ib-topo +++ b/scripts/sichek-ib-topo @@ -64,7 +64,7 @@ def parse_iblinkinfo(file_path: str): "port": int(port), "state": state.strip(), "peer": { - "lid": peer_lid if peer_lid else None, + "lid": int(peer_lid) if peer_lid else None, "port": int(peer_port) if peer_port else None, "name": peer_name.strip(), "type": peer_type, @@ -263,6 +263,9 @@ def group_leaf_switches_by_hosts(switches: dict, verbose: bool = False): # Filter out empty groups (Leaf Switches not connected to hosts) groups = {k: v for k, v in groups.items() if k} # Remove empty groups + # Sort by host_key (tuple of sorted hostnames) so group numbering is stable across runs + groups = dict(sorted(groups.items(), key=lambda x: x[0])) + # Summary total_groups = len(groups) total_leafs = len(leaf_switches) @@ -386,14 +389,17 @@ def check_leaf_to_spine_links_by_group( Check connection distribution from each leaf switch to spine by group: - Expected to connect to {expected_spine_count} different Spine Switches - Only count ports in LinkUp state + - Also collect and print the list of spine GUIDs connected to each group """ issues = [] total_groups = len(groups) passed_groups = 0 rows = [] + group_spines = {} # group_id -> sorted list of spine GUIDs (connected to any leaf in group) for group_id, (hosts, sw_guids) in enumerate(groups.items(), 1): group_passed = True + group_spine_set = set() for sw_guid in sw_guids: sw = switches[sw_guid] spine_links = defaultdict(list) # spine_name -> [port list] @@ -403,14 +409,16 @@ def check_leaf_to_spine_links_by_group( continue peer = p.get("peer") if peer and peer.get("type") == "switch": - # Check if peer is a spine - peer_sw = None + # Match peer by LID (unique per switch), not by name (multiple spines can share same model name) + peer_lid = peer.get("lid") + peer_sw_id = None for other_sw_id, other_sw in switches.items(): - if other_sw["name"] == peer["name"] and other_sw.get("type") == "spine_sw": - peer_sw = other_sw + if other_sw.get("type") == 
"spine_sw" and other_sw.get("lid") == peer_lid: + peer_sw_id = other_sw_id break - if peer_sw: - spine_links[peer["name"]].append(p["port"]) + if peer_sw_id: + spine_links[peer_sw_id].append(p["port"]) + group_spine_set.add(peer_sw_id) actual_spine_count = len(spine_links) down_ports = [p["port"] for p in sw["ports"] if "LinkUp" not in p["state"]] @@ -433,6 +441,7 @@ def check_leaf_to_spine_links_by_group( str(down_ports), ]) + group_spines[group_id] = sorted(group_spine_set) if group_passed: passed_groups += 1 @@ -451,6 +460,12 @@ def check_leaf_to_spine_links_by_group( for row in rows: print("| " + " | ".join(f"{row[i]:<{col_widths[i]}}" for i in range(len(row))) + " |") + # Print spine GUIDs per group (for leafsw_group etc.) + print("\nSpine switches connected to each leaf group (spine GUIDs):") + for group_id in sorted(group_spines.keys()): + spines = group_spines[group_id] + print(f" Group {group_id} ({len(spines)} spines): " + ", ".join(spines)) + return issues From cfd8da9653b03f6d4ea59df838479c38e9dd1e11 Mon Sep 17 00:00:00 2001 From: xlliu Date: Tue, 3 Mar 2026 03:18:20 +0000 Subject: [PATCH 3/4] add timeout to nvidia collector --- components/nvidia/nvidia.go | 5 ++++- scripts/atest/deepeptest_multi_node.py | 4 ++-- scripts/sichek-gpu-ib-occupy | 31 ++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/components/nvidia/nvidia.go b/components/nvidia/nvidia.go index 69efbac3..7f795004 100644 --- a/components/nvidia/nvidia.go +++ b/components/nvidia/nvidia.go @@ -282,10 +282,13 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp return component, nil } + // Use a timeout for collector init so nvidia-smi (SoftwareInfo.Get) cannot hang forever + collectorCtx, collectorCancel := context.WithTimeout(ctx, consts.CmdTimeout) + defer collectorCancel() // Pass the shared pointer to collector // Note: NVML calls in collector are protected by locks in nvidia.go where collector methods are called 
component.nvmlMtx.Lock() - collectorPointer, err := collector.NewNvidiaCollector(ctx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name) + collectorPointer, err := collector.NewNvidiaCollector(collectorCtx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name) component.nvmlMtx.Unlock() if err != nil { logrus.WithField("component", "nvidia").Errorf("NewNvidiaCollector failed: %v", err) diff --git a/scripts/atest/deepeptest_multi_node.py b/scripts/atest/deepeptest_multi_node.py index 3d97dadc..f63ad0c2 100644 --- a/scripts/atest/deepeptest_multi_node.py +++ b/scripts/atest/deepeptest_multi_node.py @@ -170,10 +170,10 @@ def main() -> None: apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value( - args.image_repo, config, "pytorchjob_image_repo", "registry-us-east.scitix.ai/hisys/mcore" + args.image_repo, config, "pytorchjob_image_repo", "" ) args.image_tag = pick_value( - args.image_tag, config, "pytorchjob_image_tag", "v2.1-cudnn9.14-te2.8-cuda_arch_10.0_at" + args.image_tag, config, "pytorchjob_image_tag", "" ) args.scheduler_name = pick_value(args.scheduler_name, config, "scheduler", "si-scheduler") args.roce_shared_mode = pick_value(args.roce_shared_mode, config, "roce_shared_mode", "none") diff --git a/scripts/sichek-gpu-ib-occupy b/scripts/sichek-gpu-ib-occupy index 8be84566..5a1c70a4 100755 --- a/scripts/sichek-gpu-ib-occupy +++ b/scripts/sichek-gpu-ib-occupy @@ -5,6 +5,29 @@ set -euo pipefail MIN_FREE_GPU=8 MIN_FREE_IB=32 +# Convert Kubernetes quantity string to integer for arithmetic (e.g. 
1k -> 1000, 2 -> 2) +to_int_quantity() { + local v="${1:-0}" + v="${v//[[:space:]]/}" + if [[ "$v" =~ ^[0-9]+$ ]]; then + echo "$v" + elif [[ "$v" =~ ^([0-9]+)k$ ]]; then + echo $((${BASH_REMATCH[1]} * 1000)) + elif [[ "$v" =~ ^([0-9]+)Ki$ ]]; then + echo $((${BASH_REMATCH[1]} * 1024)) + elif [[ "$v" =~ ^([0-9]+)M$ ]]; then + echo $((${BASH_REMATCH[1]} * 1000000)) + elif [[ "$v" =~ ^([0-9]+)Mi$ ]]; then + echo $((${BASH_REMATCH[1]} * 1048576)) + elif [[ "$v" =~ ^([0-9]+)G$ ]]; then + echo $((${BASH_REMATCH[1]} * 1000000000)) + elif [[ "$v" =~ ^([0-9]+)Gi$ ]]; then + echo $((${BASH_REMATCH[1]} * 1073741824)) + else + echo "0" + fi +} + # Parse parameters while [[ $# -gt 0 ]]; do case "$1" in @@ -51,7 +74,9 @@ for node in $(kubectl get nodes -o json | jq -r '.items[] | select(.status.alloc # GPU allocated gpu_allocated=$(kubectl get pods -A --field-selector spec.nodeName="$node",status.phase=Running \ -o json | jq "[.items[].spec.containers[].resources.requests[\"$GPU_RES\"] | (tonumber? // 0)] | add // 0") - gpu_free=$((gpu_allocatable - gpu_allocated)) + gpu_allocatable_int=$(to_int_quantity "$gpu_allocatable") + gpu_allocated_int=$(to_int_quantity "$gpu_allocated") + gpu_free=$((gpu_allocatable_int - gpu_allocated_int)) # IB capacity & allocatable ib_capacity=$(kubectl get node "$node" -o json | jq -r --arg res "$IB_RES" '.status.capacity[$res] // 0') @@ -59,7 +84,9 @@ for node in $(kubectl get nodes -o json | jq -r '.items[] | select(.status.alloc # IB allocated ib_allocated=$(kubectl get pods -A --field-selector spec.nodeName="$node",status.phase=Running \ -o json | jq "[.items[].spec.containers[].resources.requests[\"$IB_RES\"] | (tonumber? 
// 0)] | add // 0") - ib_free=$((ib_allocatable - ib_allocated)) + ib_allocatable_int=$(to_int_quantity "$ib_allocatable") + ib_allocated_int=$(to_int_quantity "$ib_allocated") + ib_free=$((ib_allocatable_int - ib_allocated_int)) # Condition: remaining resources < specified value if [[ "$gpu_free" -lt "$MIN_FREE_GPU" || "$ib_free" -lt "$MIN_FREE_IB" ]]; then From 1f32fdf0420975f2775b6c212607ff4aa746d8d3 Mon Sep 17 00:00:00 2001 From: xlliu Date: Tue, 3 Mar 2026 04:22:20 +0000 Subject: [PATCH 4/4] add deploy.yaml --- k8s/README.md | 137 ++++++++++++++++++++++++++++++++ install.yaml => k8s/deploy.yaml | 100 +++++++++++++++++++---- 2 files changed, 220 insertions(+), 17 deletions(-) create mode 100644 k8s/README.md rename install.yaml => k8s/deploy.yaml (77%) diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 00000000..32e31f6c --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,137 @@ +# `k8s/deploy.yaml` README + +This document explains how to render, apply, and operate `k8s/deploy.yaml`. + +## What this manifest contains + +`k8s/deploy.yaml` defines: + +- A `DaemonSet` named `sichek-gpu` in namespace `monitoring` +- A `ServiceAccount` (`sa-sichek`) +- A `ClusterRole` and `ClusterRoleBinding` +- A `PodMonitor` (`sichek-exporter`) + +The DaemonSet has: + +- `initContainer` (`sichek-init`) that installs/verifies `sichek` on the host +- Main container (`sichek`) that keeps host-side `sichek` service alive +- Exporter container (`sichek-exporter`) that exposes metrics over HTTP + +Important runtime characteristics: + +- `hostPID: true` +- `hostNetwork: true` +- `privileged: true` +- Uses `nsenter` to execute commands in host namespaces +- Uses hostPath `/var/sichek` (mounted as `/host/var/sichek` in containers) + +## Template variables you must render + +`k8s/deploy.yaml` is templated. 
Replace these placeholders before apply: + +- `{{ .registry }}`: image registry, for example `registry.example.com` +- `{{ .version }}`: sichek version tag, for example `v0.7.6` +- `{{ .sichek_spec_url }}`: spec fallback URL (or empty string) +- `{{ .metrics_port }}`: exporter metrics port, for example `19092` + +## Prerequisites + +1. Namespace exists: + +```bash +kubectl create namespace monitoring +``` + +2. ConfigMaps exist in `monitoring`: + +- `sichek-default-spec` with key `default_spec.yaml` +- `sichek-default-user-config` with key `default_user_config.yaml` + +Example: + +```bash +kubectl create configmap sichek-default-spec \ + --from-file=default_spec.yaml=/path/to/default_spec.yaml \ + -n monitoring + +kubectl create configmap sichek-default-user-config \ + --from-file=default_user_config.yaml=/path/to/default_user_config.yaml \ + -n monitoring +``` + +3. Cluster has permissions and binaries expected by the script on target nodes: + +- `systemd`/`systemctl` +- package manager (`rpm` or `dpkg`) +- host can run `sichek` after install + +## Render and apply + +From repository root: + +```bash +export REGISTRY="registry-ap-southeast.scitix.ai" +export SICHEK_VERSION="v0.7.6" +export SICHEK_SPEC_URL='""' +export METRICS_PORT="19092" + +sed -e "s|{{ \\.registry }}|${REGISTRY}|g" \ + -e "s|{{ \\.version }}|${SICHEK_VERSION}|g" \ + -e "s|{{ \\.sichek_spec_url }}|${SICHEK_SPEC_URL}|g" \ + -e "s|{{ \\.metrics_port }}|${METRICS_PORT}|g" \ + k8s/deploy.yaml > k8s/deploy.rendered.yaml + +kubectl apply -f k8s/deploy.rendered.yaml +``` + +## Verify deployment + +```bash +kubectl get daemonset -n monitoring sichek-gpu +kubectl get pods -n monitoring -l app=sichek -o wide + +# init/install logs +kubectl logs -n monitoring -c sichek-init + +# keepalive service logs +kubectl logs -n monitoring -c sichek -f + +# exporter logs +kubectl logs -n monitoring -c sichek-exporter -f +``` + +## Update flow + +### Update config only + +1. Update ConfigMaps +2. 
Restart DaemonSet: + +```bash +kubectl rollout restart daemonset/sichek-gpu -n monitoring +``` + +### Upgrade version + +1. Change rendered values (`REGISTRY` / `SICHEK_VERSION`) +2. Re-render and re-apply +3. Watch rollout: + +```bash +kubectl rollout status daemonset/sichek-gpu -n monitoring +``` + +## Uninstall + +```bash +kubectl delete -f k8s/deploy.rendered.yaml +``` + +Note: this removes Kubernetes resources, but host files under `/var/sichek` and host-installed packages may remain. + +## Troubleshooting quick notes + +- `Init` fails: check `sichek-init` logs first. +- `sichek` keeps restarting: verify host `systemctl is-active sichek`. +- Exporter exits with socket timeout: check `/var/sichek/run/current/metrics.sock` on host. +- Pod cannot schedule: inspect taints/resources and DaemonSet events. diff --git a/install.yaml b/k8s/deploy.yaml similarity index 77% rename from install.yaml rename to k8s/deploy.yaml index 853ce40b..f28a3de0 100644 --- a/install.yaml +++ b/k8s/deploy.yaml @@ -22,13 +22,7 @@ spec: spec: hostPID: true hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: scitix.ai/gpu-type - operator: Exists + tolerations: - operator: Exists @@ -54,7 +48,7 @@ spec: ########################################################### containers: - name: sichek - image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + image: "{{ .registry }}/hisys/sichek:{{ .version }}" imagePullPolicy: Always securityContext: @@ -66,7 +60,11 @@ spec: fieldRef: fieldPath: spec.nodeName - name: SICHEK_SPEC_URL - value: {{ .sichek_spec_url }} + value: {{ .sichek_spec_url }} + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid volumeMounts: - name: sichek-host @@ -79,8 +77,7 @@ spec: - /bin/bash - -c - | - nsenter -t 1 -m -p -n -u -i -- \ - rm -rf /var/sichek/run/pods/${POD_UID} + rm -rf /host/var/sichek/run/pods/${POD_UID} command: ["/bin/bash", "-c"] args: @@ 
-109,9 +106,14 @@ spec: memory: 1Gi - name: sichek-exporter - image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + image: "{{ .registry }}/hisys/sichek:{{ .version }}" securityContext: privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: sichek-host mountPath: /host/var/sichek @@ -123,10 +125,27 @@ spec: - | set -euo pipefail + echo "[sichek] restart dcgm-exportor" + HOST="nsenter -t 1 -m -p -n -u -i --" + $HOST kubectl --kubeconfig=/var/sichek/run/current/kubeconfig delete pod -nmonitoring -lapp.kubernetes.io/name=dcgm-exporter --field-selector spec.nodeName=${NODE_NAME} + $HOST kubectl --kubeconfig=/var/sichek/run/current/kubeconfig get pod -nmonitoring -lapp.kubernetes.io/name=dcgm-exporter --field-selector spec.nodeName=${NODE_NAME} + RUNTIME=/host/var/sichek/run/current + timeout=10 + elapsed=0 + while [ ! -S "${RUNTIME}/metrics.sock" ]; do + if [ $elapsed -ge $timeout ]; then + echo "[sichek] ERROR: ${RUNTIME}/metrics.sock not ready after ${timeout}s" + exit 1 + fi + sleep 1 + elapsed=$((elapsed + 1)) + done + + echo "[sichek] ${RUNTIME}/metrics.sock is ready, starting exporter..." 
sichek exporter \ --metrics-socket ${RUNTIME}/metrics.sock \ - --listen :{{ .metrics_port }} + --listen :19092 resources: requests: @@ -141,7 +160,7 @@ spec: ############################################################ initContainers: - name: sichek-init - image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + image: "{{ .registry }}/hisys/sichek:{{ .version }}" imagePullPolicy: Always securityContext: @@ -235,6 +254,10 @@ spec: echo "[sichek] install via dpkg" cp "$SICHEK_DEB" "${RUNTIME_BASE}/" $HOST dpkg -P sichek || echo "Failed to remove existing sichek" + ###dpkg: warning: while removing sichek, directory '/var/sichek/scripts' not empty so not removed + ###dpkg: warning: while removing sichek, directory '/var/sichek/config' not empty so not removed + rm -rf /host/var/sichek/scripts + rm -rf /host/var/sichek/config $HOST dpkg -i "${HOST_RUNTIME_BASE}/$(basename "$SICHEK_DEB")" rm "${RUNTIME_BASE}/$(basename "$SICHEK_DEB")" else @@ -313,6 +336,27 @@ spec: chmod 644 "${TMP_ENV}" mv "${TMP_ENV}" "${RUNTIME_BASE}/env" + ########################################################################### + # 3. 
delete pod dir in /var/sichek/run/pods except current and canary link + ########################################################################### + PODS_DIR="$RUNTIME_BASE/pods" + CURRENT_TARGET=$(readlink -f "$RUNTIME_BASE/current") + CANARY_TARGET=$(readlink -f "$RUNTIME_BASE/canary") + echo "Current points to: $CURRENT_TARGET" + echo "Canary points to: $CANARY_TARGET" + + for d in "$PODS_DIR"/*; do + [ -d "$d" ] || continue + + # skip current/canary dir + if [[ "$d" == "$CURRENT_TARGET" || "$d" == "$CANARY_TARGET" ]]; then + echo "Skipping $d" + continue + fi + echo "Deleting $d" + rm -rf "$d" + done + dnsPolicy: ClusterFirstWithHostNet restartPolicy: Always terminationGracePeriodSeconds: 30 @@ -331,7 +375,7 @@ metadata: rules: - apiGroups: ["kubeflow.org", ""] resources: ["nodes", "pods", "pytorchjobs"] - verbs: ["get", "list", "patch", "update", "watch"] + verbs: ["get", "list", "patch", "update", "watch", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -339,10 +383,32 @@ metadata: name: cluster-role-binding-sichek namespace: monitoring roleRef: - apiGroup: rbac.authorization.k8s.io # Remove the /v1 + apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cluster-role-sichek subjects: - kind: ServiceAccount name: sa-sichek - namespace: monitoring \ No newline at end of file + namespace: monitoring +--- +# Source: sichek/templates/prometheus.yaml +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: sichek-exporter + namespace: monitoring + labels: + app: sichek +spec: + podMetricsEndpoints: + - interval: 15s # Scrape interval + path: /metrics # Metrics path + port: metrics + scrapeTimeout: 10s + scheme: http + namespaceSelector: + matchNames: + - monitoring + selector: + matchLabels: + app: sichek \ No newline at end of file