From 3408d48bf39c85dd13e57405e7ecb3ffeaa2287e Mon Sep 17 00:00:00 2001 From: xlliu Date: Thu, 12 Feb 2026 01:10:45 +0000 Subject: [PATCH 1/4] disable NVIDIA XidPoller by default --- .goreleaser.yaml | 1 + cmd/command/version.go | 7 +- components/nvidia/config/config.go | 5 + components/nvidia/nvidia.go | 50 +++-- components/nvidia/xid_poller.go | 14 +- install.yaml | 348 +++++++++++++++++++++++++++++ 6 files changed, 403 insertions(+), 22 deletions(-) create mode 100644 install.yaml diff --git a/.goreleaser.yaml b/.goreleaser.yaml index 97e2770a..2e5dd045 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -53,6 +53,7 @@ builds: - -mod=vendor ldflags: - "-s -w" + - "-X github.com/scitix/sichek/cmd/command.Version={{ .Tag }}" - "-X github.com/scitix/sichek/cmd/command.Major={{ .Major }}" - "-X github.com/scitix/sichek/cmd/command.Minor={{ .Minor }}" - "-X github.com/scitix/sichek/cmd/command.Patch={{ .Patch }}" diff --git a/cmd/command/version.go b/cmd/command/version.go index c15ac16b..74694948 100644 --- a/cmd/command/version.go +++ b/cmd/command/version.go @@ -26,6 +26,9 @@ import ( ) var ( + // Version is the full version string (e.g. v0.7.6.post1), set by ldflags at build time. + // When set, it is used for display instead of Major.Minor.Patch. + Version = "" Major = "" Minor = "" Patch = "" @@ -52,7 +55,9 @@ func NewVersionCmd() *cobra.Command { now := time.Now() BuildTime = now.Format("2006-01-02T15:04:05") } - if Major == "" { + if Version != "" { + version = Version + } else if Major == "" { version = "dev-" + GitCommit } else { version = "v" + Major + "." + Minor + "." 
+ Patch diff --git a/components/nvidia/config/config.go b/components/nvidia/config/config.go index fa4c92b4..a4ddaa07 100644 --- a/components/nvidia/config/config.go +++ b/components/nvidia/config/config.go @@ -27,9 +27,14 @@ type NvidiaConfig struct { QueryInterval common.Duration `json:"query_interval" yaml:"query_interval"` CacheSize int64 `json:"cache_size" yaml:"cache_size"` EnableMetrics bool `json:"enable_metrics" yaml:"enable_metrics"` + EnableXidPoller bool `json:"enable_xid_poller" yaml:"enable_xid_poller"` IgnoredCheckers []string `json:"ignored_checkers,omitempty" yaml:"ignored_checkers,omitempty"` } +func (c *NvidiaConfig) IsXidPollerEnabled() bool { + return c.EnableXidPoller +} + func (c *NvidiaUserConfig) GetCheckerSpec() map[string]common.CheckerSpec { commonCfgMap := make(map[string]common.CheckerSpec) return commonCfgMap diff --git a/components/nvidia/nvidia.go b/components/nvidia/nvidia.go index 881b9826..69efbac3 100644 --- a/components/nvidia/nvidia.go +++ b/components/nvidia/nvidia.go @@ -66,6 +66,10 @@ type component struct { metrics *metrics.NvidiaMetrics initError error // Track initialization errors with detailed information + + // ReNewNvml rate limit: when called from HealthCheck timeout path, skip if last ReNewNvml was < 60s ago + renewBackoffMtx sync.Mutex + lastReNewNvmlTime time.Time } var ( @@ -117,6 +121,17 @@ func NewNvml(ctx context.Context) (nvml.Interface, error) { // Note: The caller must hold c.healthCheckMtx lock before calling this function // to ensure thread-safe access to nvmlInst and xidPoller. 
func ReNewNvml(c *component) error { + const minReNewNvmlInterval = 60 * time.Second + // When nvmlInst exists (called from HealthCheck timeout path), rate-limit to avoid thrashing with DCGM + if c.nvmlInst != nil { + c.renewBackoffMtx.Lock() + last := c.lastReNewNvmlTime + c.renewBackoffMtx.Unlock() + if !last.IsZero() && time.Since(last) < minReNewNvmlInterval { + logrus.WithField("component", "nvidia").Debugf("ReNewNvml skipped (last run %v ago)", time.Since(last)) + return nil + } + } // Stop the XidEventPoller before shutting down NVML to prevent SIGSEGV if c.xidPoller != nil { @@ -144,31 +159,34 @@ func ReNewNvml(c *component) error { if c.nvmlInstPtr != nil { *c.nvmlInstPtr = nvmlInst } - // Recreate the XidEventPoller with the new NVML instance - // Use RLock to check running status (nested lock: healthCheckMtx -> serviceMtx is safe) + // Recreate the XidEventPoller with the new NVML instance only when enabled c.serviceMtx.RLock() isRunning := c.running c.serviceMtx.RUnlock() - if isRunning { - newXidPoller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel) + var newPoller *XidEventPoller + if isRunning && c.cfg != nil && c.cfg.Nvidia != nil && c.cfg.Nvidia.IsXidPollerEnabled() { + poller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel) if err != nil { logrus.WithField("component", "nvidia").Errorf("failed to recreate xid poller after NVML reinit: %v", err) } else { - c.xidPoller = newXidPoller - // Restart the poller in a goroutine if the component is still running + newPoller = poller go func() { defer func() { if err := recover(); err != nil { fmt.Printf("[xidPoller] panic err is %s\n", err) } }() - err := c.xidPoller.Start() + err := poller.Start() if err != nil { logrus.WithField("component", "nvidia").Errorf("start xid poller failed after reinit: %v", err) } }() } } + c.xidPoller = newPoller + c.renewBackoffMtx.Lock() + c.lastReNewNvmlTime = time.Now() + c.renewBackoffMtx.Unlock() } return ret 
} @@ -282,11 +300,16 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp return component, nil } - xidPoller, err := NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel) - if err != nil { - logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err) - component.initError = fmt.Errorf("failed to create XID event poller: %w", err) - return component, nil + var xidPoller *XidEventPoller + if nvidiaCfg.Nvidia.IsXidPollerEnabled() { + xidPoller, err = NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel) + if err != nil { + logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err) + component.initError = fmt.Errorf("failed to create XID event poller: %w", err) + return component, nil + } + } else { + logrus.WithField("component", "nvidia").Infof("XID event poller disabled") } freqController := common.GetFreqController() @@ -517,8 +540,7 @@ func (c *component) Start() <-chan *common.Result { } // Check if the error message contains "Timeout" if result != nil && len(result.Checkers) > 0 && strings.Contains(result.Checkers[0].Name, "HealthCheckTimeout") { - // Handle the timeout error - // ReNewNvml requires healthCheckMtx lock to be held + // HealthCheck timed out: try ReNewNvml c.healthCheckMtx.Lock() err := ReNewNvml(c) c.healthCheckMtx.Unlock() diff --git a/components/nvidia/xid_poller.go b/components/nvidia/xid_poller.go index aa565043..1c9e6e44 100644 --- a/components/nvidia/xid_poller.go +++ b/components/nvidia/xid_poller.go @@ -83,21 +83,21 @@ func (x *XidEventPoller) Start() error { // waits for the duration specified in x.Cfg.UpdateInterval (in seconds) // ref. 
https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents - // e, err := x.XidEventSet.Wait(uint32(x.Cfg.UpdateInterval.Microseconds())) - event, ret := x.XidEventSet.Wait(200) - logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned: %v, %v", event.EventData, ret) - if ret == nvml.ERROR_NOT_SUPPORTED { - logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret)) + event, ret := x.XidEventSet.Wait(uint32(1000)) + if ret == nvml.ERROR_TIMEOUT { + // no event within timeout — normal + logrus.WithField("component", "nvidia").Debugf("XidEventSet.Wait timeout (no event)") continue } - if ret == nvml.ERROR_TIMEOUT { - // no event within timeout + if ret == nvml.ERROR_NOT_SUPPORTED { + logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret)) continue } if ret != nvml.SUCCESS { logrus.WithField("component", "nvidia").Warningf("XidEventSet.Wait failure -- Retrying: %v", nvml.ErrorString(ret)) continue } + logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned event: %v", event.EventData) x.handleEvent(event) } diff --git a/install.yaml b/install.yaml new file mode 100644 index 00000000..853ce40b --- /dev/null +++ b/install.yaml @@ -0,0 +1,348 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: sichek-gpu + namespace: monitoring + labels: + app: sichek +spec: + selector: + matchLabels: + app: sichek + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 10 + + template: + metadata: + labels: + app: sichek + spec: + hostPID: true + hostNetwork: true + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: scitix.ai/gpu-type + operator: Exists + tolerations: + - operator: Exists + + serviceAccountName: sa-sichek + + ############################################################ + # volumes + 
############################################################ + volumes: + - name: sichek-host + hostPath: + path: /var/sichek + type: DirectoryOrCreate + - name: sichek-default-spec + configMap: + name: sichek-default-spec + - name: sichek-default-user-config + configMap: + name: sichek-default-user-config + + ########################################################### + # main container + ########################################################### + containers: + - name: sichek + image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + imagePullPolicy: Always + + securityContext: + privileged: true + + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: SICHEK_SPEC_URL + value: {{ .sichek_spec_url }} + + volumeMounts: + - name: sichek-host + mountPath: /host/var/sichek + + lifecycle: + preStop: + exec: + command: + - /bin/bash + - -c + - | + nsenter -t 1 -m -p -n -u -i -- \ + rm -rf /var/sichek/run/pods/${POD_UID} + + command: ["/bin/bash", "-c"] + args: + - | + set -euo pipefail + + HOST="nsenter -t 1 -m -p -n -u -i --" + echo "[sichek] ensure service started" + $HOST sichek d start + + echo "[sichek] entering keepalive loop" + while true; do + if ! 
$HOST systemctl is-active --quiet sichek; then + echo "[sichek] service not active, restarting" + $HOST sichek d start + fi + sleep 10 + done + + resources: + requests: + cpu: 100m + memory: 1Mi + limits: + cpu: "1" + memory: 1Gi + + - name: sichek-exporter + image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + securityContext: + privileged: true + volumeMounts: + - name: sichek-host + mountPath: /host/var/sichek + ports: + - name: metrics + containerPort: {{ .metrics_port }} + command: ["/bin/bash", "-c"] + args: + - | + set -euo pipefail + + RUNTIME=/host/var/sichek/run/current + sichek exporter \ + --metrics-socket ${RUNTIME}/metrics.sock \ + --listen :{{ .metrics_port }} + + resources: + requests: + cpu: 100m + memory: 1Mi + limits: + cpu: "1" + memory: 1Gi + + ############################################################ + # init container + ############################################################ + initContainers: + - name: sichek-init + image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + imagePullPolicy: Always + + securityContext: + privileged: true + runAsUser: 0 + capabilities: + add: [SYS_ADMIN] + + env: + - name: SICHEK_VERSION + value: {{ .version }} + - name: SICHEK_SPEC_URL + value: {{ .sichek_spec_url }} + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + + volumeMounts: + - name: sichek-host + mountPath: /host/var/sichek + - name: sichek-default-spec + mountPath: /var/sichek/config/default_spec.yaml + subPath: default_spec.yaml + - name: sichek-default-user-config + mountPath: /var/sichek/config/default_user_config.yaml + subPath: default_user_config.yaml + + command: ["/bin/bash", "-c"] + args: + - | + set -euo pipefail + set -x + + HOST="nsenter -t 1 -m -p -n -u -i --" + SICL=$(ls /opt/sichek/dist/sicl-*.run) + SICHEK_RPM=$(ls /opt/sichek/dist/sichek_*_linux_amd64.rpm) + SICHEK_DEB=$(ls /opt/sichek/dist/sichek_*_linux_amd64.deb) + 
RUNTIME_BASE=/host/var/sichek/run + HOST_RUNTIME_BASE=/var/sichek/run + HOST_SICL_INSTALL_PATH=/usr/local/sihpc + [[ -e "${RUNTIME_BASE}" ]] && rm -rf "${RUNTIME_BASE}" + mkdir -p "${RUNTIME_BASE}" + + ########################################################## + # 1. install / verify sichek on host + ########################################################## + # 1.1 check existing sichek + need_install=true + if $HOST sh -c 'command -v sichek >/dev/null 2>&1'; then + if $HOST sichek version >/dev/null 2>&1; then + current_version=$($HOST sichek version 2>&1 | awk '/^Version:/ {print $2}') + else + echo "[sichek] sichek exists but cannot run (glibc mismatch?)" + current_version="" + fi + echo "[sichek] found sichek version=${current_version}" + if [[ "${current_version}" == "${SICHEK_VERSION}" ]]; then + echo "[sichek] version match(${current_version} == ${SICHEK_VERSION}), skip install" + need_install=false + else + echo "[sichek] version mismatch(${current_version} != ${SICHEK_VERSION}), reinstall" + fi + else + echo "[sichek] sichek not installed" + fi + + # 1.2 install if needed + echo "[sichek] node=${NODE_NAME}" + if [[ "${need_install}" == "true" ]]; then + $HOST sichek daemon stop || echo "Failed to stop sichek daemon" + + echo "[sichek] install sicl" + $HOST rm -rf "${HOST_SICL_INSTALL_PATH}" + cp "$SICL" "${RUNTIME_BASE}/" + $HOST bash "${HOST_RUNTIME_BASE}/$(basename "$SICL")" + rm "${RUNTIME_BASE}/$(basename "$SICL")" + + echo "[sichek] install sichek" + if $HOST sh -c 'command -v rpm >/dev/null 2>&1'; then + echo "[sichek] install via rpm" + cp "$SICHEK_RPM" "${RUNTIME_BASE}/" + $HOST rpm -e sichek || echo "Failed to remove existing sichek" + $HOST rpm -Uvh --force "${HOST_RUNTIME_BASE}/$(basename "$SICHEK_RPM")" + rm "${RUNTIME_BASE}/$(basename "$SICHEK_RPM")" + elif $HOST sh -c 'command -v dpkg >/dev/null 2>&1'; then + echo "[sichek] install via dpkg" + cp "$SICHEK_DEB" "${RUNTIME_BASE}/" + $HOST dpkg -P sichek || echo "Failed to remove existing 
sichek" + $HOST dpkg -i "${HOST_RUNTIME_BASE}/$(basename "$SICHEK_DEB")" + rm "${RUNTIME_BASE}/$(basename "$SICHEK_DEB")" + else + echo "[sichek] no rpm / dpkg found on host" + exit 1 + fi + cp /var/sichek/config/default_spec.yaml /host/var/sichek/config/default_spec.yaml + cp /var/sichek/config/default_user_config.yaml /host/var/sichek/config/default_user_config.yaml + fi + + # 1.3 final version check + echo "[sichek] final version check" + if $HOST sichek version >/dev/null 2>&1; then + final_version=$($HOST sichek version 2>&1 | awk '/^Version:/ {print $2}') + else + echo "[sichek] sichek exists but cannot run (glibc mismatch?)" + final_version="" + fi + if [[ "${final_version}" != "${SICHEK_VERSION}" ]]; then + echo "Installed sichek version (${final_version}) does not match expected (${SICHEK_VERSION})" + exit 1 + fi + echo "[sichek] sichek ${final_version} ready" + + cp /var/sichek/config/default_spec.yaml /host/var/sichek/config/default_spec.yaml + cp /var/sichek/config/default_user_config.yaml /host/var/sichek/config/default_user_config.yaml + + ########################################################## + # 2. 
generate kubeconfig (atomic) + ########################################################## + POD_DIR=${RUNTIME_BASE}/pods/${POD_UID} + CURRENT_LINK=${RUNTIME_BASE}/current + mkdir -p "${POD_DIR}" + CA_FILE=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + TOKEN_FILE=/var/run/secrets/kubernetes.io/serviceaccount/token + SERVER="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT}" + + CA_DATA=$(base64 < "${CA_FILE}" | tr -d '\n') + TOKEN=$(cat "${TOKEN_FILE}") + + tmp=$(mktemp "${POD_DIR}/kubeconfig.XXXX") + cat <<EOF > "${tmp}" + apiVersion: v1 + kind: Config + clusters: + - name: local + cluster: + server: ${SERVER} + certificate-authority-data: ${CA_DATA} + users: + - name: sichek + user: + token: ${TOKEN} + contexts: + - name: sichek-context + context: + cluster: local + user: sichek + current-context: sichek-context + EOF + + chmod 600 "${tmp}" + mv "${tmp}" "${POD_DIR}/kubeconfig" + + # Use relative symlink so both host and container resolve current under their view of the mount. + # (Absolute /var/sichek/... would not resolve inside container which only has /host/var/sichek.) 
+ ln -sfn "pods/${POD_UID}" "${CURRENT_LINK}" + + HOST_CURRENT=${HOST_RUNTIME_BASE}/current + TMP_ENV=$(mktemp "${POD_DIR}/env.XXXX") + cat <<EOF > "${TMP_ENV}" + FLAGS="--metrics-socket ${HOST_CURRENT}/metrics.sock -s /var/sichek/config/default_spec.yaml -c /var/sichek/config/default_user_config.yaml --log-file ${HOST_CURRENT}/sichek.log" + KUBECONFIG=${HOST_CURRENT}/kubeconfig + SICHEK_SPEC_URL={{ .sichek_spec_url }} + EOF + chmod 644 "${TMP_ENV}" + mv "${TMP_ENV}" "${RUNTIME_BASE}/env" + + dnsPolicy: ClusterFirstWithHostNet + restartPolicy: Always + terminationGracePeriodSeconds: 30 +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sa-sichek + namespace: monitoring +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-role-sichek +rules: + - apiGroups: ["kubeflow.org", ""] + resources: ["nodes", "pods", "pytorchjobs"] + verbs: ["get", "list", "patch", "update", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-role-binding-sichek + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io # apiGroup must not include the version suffix (no /v1) + kind: ClusterRole + name: cluster-role-sichek +subjects: + - kind: ServiceAccount + name: sa-sichek + namespace: monitoring \ No newline at end of file From 5dbceafe97a8d03e7558ad97bf087ef9a0de1ad1 Mon Sep 17 00:00:00 2001 From: xlliu Date: Mon, 23 Feb 2026 11:42:38 +0000 Subject: [PATCH 2/4] Fix ib_dev checker and enhance atest features - Fix abnormal reporting in ib_dev checker when devices not defined in spec are detected - Add --swanlab-mode option to atest - Add k8s wrapper commands for atest --- .github/workflows/release.yml | 11 ++++ components/infiniband/checker/ib_devs.go | 11 ++++ config/default_spec.yaml | 24 ++++++- docker/Dockerfile.cuda128 | 80 ++++++++++++++++++++++++ scripts/atest/common.py | 13 ++++ scripts/atest/config.py | 5 ++ scripts/atest/deepeptest_multi_node.py | 17 +++-
scripts/atest/deepeptest_single_node.py | 10 ++- scripts/atest/modeltest_multi_node.py | 26 +++++--- scripts/atest/modeltest_single_node.py | 6 +- scripts/atest/mpijob_helper.py | 8 ++- scripts/atest/nccltest_multi_node.py | 30 ++++++--- scripts/atest/nccltest_single_node.py | 10 +-- scripts/postinstall.sh | 3 + scripts/sichek-ib-topo | 29 ++++++--- 15 files changed, 240 insertions(+), 43 deletions(-) create mode 100644 docker/Dockerfile.cuda128 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 822f8210..c8114abb 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -67,3 +67,14 @@ jobs: ghcr.io/${{ github.repository_owner }}/sichek:latest labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} + - name: Build and push Docker image (ubuntu20.04, cuda12.8) + uses: docker/build-push-action@v4 + with: + context: ./ + file: docker/Dockerfile.cuda128 + platforms: linux/amd64 + push: true + tags: | + ghcr.io/${{ github.repository_owner }}/sichek:${{ github.ref_name }}-ubuntu2004-cuda12.8 + labels: | + org.opencontainers.image.source=https://github.com/${{ github.repository }} diff --git a/components/infiniband/checker/ib_devs.go b/components/infiniband/checker/ib_devs.go index b1fc840b..cf63d751 100644 --- a/components/infiniband/checker/ib_devs.go +++ b/components/infiniband/checker/ib_devs.go @@ -62,6 +62,7 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes var mismatchPairs []string infinibandInfo.RLock() + // 1) Spec -> actual: missing or wrong mapping for expectedMlx5, expectedIb := range c.spec.IBPFDevs { // skip mezzanine card in check if strings.Contains(expectedMlx5, "mezz") { @@ -80,6 +81,16 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s -> %s (expected %s)", expectedMlx5, actualIb, expectedIb)) } } + // 2) Actual -> spec: extra devices not defined in 
spec (e.g. mlx5_7 in spec but actual shows mlx5_13_6209) + for actualMlx5 := range infinibandInfo.IBPFDevs { + if strings.Contains(actualMlx5, "mezz") { + continue + } + if _, inSpec := c.spec.IBPFDevs[actualMlx5]; !inSpec { + mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s (not in spec)", actualMlx5)) + logrus.WithField("component", "infiniband").Debugf("mismatch: actual device %s not defined in spec", actualMlx5) + } + } infinibandInfo.RUnlock() if len(mismatchPairs) > 0 { diff --git a/config/default_spec.yaml b/config/default_spec.yaml index 2f3c8f42..da63ee78 100644 --- a/config/default_spec.yaml +++ b/config/default_spec.yaml @@ -66,9 +66,27 @@ infiniband: default: <<: *ib_base hca: - MT_0000000838: {} - DEL0000000036: {} - MT_0000000223: {} + MT_0000000838: + hardware: + hca_type: "MT4129" + board_id: "MT_0000000838" + fw_ver: ">=28.39.2048" + vpd: "NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP, PCIe 5.0 x16, Crypto Disabled, Secure Boot Enabled" + net_port: 1 + port_speed: "400 Gb/sec (4X NDR)" + phy_state: "LinkUp" + port_state: "ACTIVE" + net_operstate: "down" + link_layer: "InfiniBand" + pcie_width: "16" + pcie_speed: "32.0 GT/s PCIe" + pcie_tree_width: "16" + pcie_tree_speed: "32" + pcie_acs: "disable" + pcie_mrr: "4096" + perf: + one_way_bw: 360 # Gbps + avg_latency_us: 10 # us pcie_topo: "0x233510de": numa_config: diff --git a/docker/Dockerfile.cuda128 b/docker/Dockerfile.cuda128 new file mode 100644 index 00000000..5a12d931 --- /dev/null +++ b/docker/Dockerfile.cuda128 @@ -0,0 +1,80 @@ +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS build + +ARG GO_VERSION=1.23.3 +ARG GORELEASER_VERSION=v2.13.1 + +RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list 2>/dev/null || true && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential gcc g++ curl git wget ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# Install Go +RUN curl -fsSL 
https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz | \ + tar -C /usr/local -xz && \ + ln -s /usr/local/go/bin/go /usr/local/bin/go + +# Install GoReleaser +RUN curl -fsSL https://github.com/goreleaser/goreleaser/releases/download/${GORELEASER_VERSION}/goreleaser_Linux_x86_64.tar.gz | \ + tar -xz -C /usr/local/bin goreleaser && \ + goreleaser --version + +ENV GOSUMDB=off +WORKDIR /go/src/sichek + +COPY . . + +ARG BUILD_TIME +RUN if [ -n "$BUILD_TIME" ]; then \ + BUILD_TIME=$BUILD_TIME goreleaser release --snapshot --clean ; \ + else \ + goreleaser release --snapshot --clean ; \ + fi + +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai +ENV NVIDIA_VISIBLE_DEVICES="" + +RUN mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-cuda.list 2>/dev/null || true && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + vim curl tzdata ca-certificates \ + openssh-server openssh-client \ + iproute2 iputils-ping jq \ + libnuma1 numactl rdmacm-utils perftest xz-utils && \ + ln -fs /usr/share/zoneinfo/${TZ} /etc/localtime && \ + dpkg-reconfigure --frontend noninteractive tzdata && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Helm (runtime tool) +RUN curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + +WORKDIR /opt/sichek +COPY --from=build /go/src/sichek/dist ./dist + +ARG SICL_VERSION=sicl-25.11-1.cuda128.ubuntu2004.run +RUN curl -fsSL -o ./dist/${SICL_VERSION} \ + https://oss-ap-southeast.scitix.ai/scitix-release/${SICL_VERSION} && \ + bash ./dist/${SICL_VERSION} && \ + dpkg -i ./dist/sichek_*_linux_amd64.deb + +ENV SIHPC_HOME=/usr/local/sihpc +ENV PATH=${SIHPC_HOME}/bin:$PATH +ENV LD_LIBRARY_PATH=${SIHPC_HOME}/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +ENV OMPI_MCA_opal_prefix=${SIHPC_HOME} +ENV OPAL_PREFIX=${SIHPC_HOME} + +# SSH dependencies for MPI +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " 
UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ + mkdir -p /var/run/sshd /root/.ssh && \ + chmod 700 /root/.ssh +RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -N "" && \ + cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \ + chmod 600 /root/.ssh/authorized_keys + +EXPOSE 22 diff --git a/scripts/atest/common.py b/scripts/atest/common.py index 21ae0fe2..99d9aa69 100644 --- a/scripts/atest/common.py +++ b/scripts/atest/common.py @@ -159,6 +159,7 @@ def is_valid_value(value): ("SWANLAB_API_KEY", "swanlab_api_key"), ("SWANLAB_WORKSPACE", "swanlab_workspace"), ("SWANLAB_PROJ_NAME", "swanlab_proj_name"), + ("SWANLAB_MODE", "swanlab_mode"), ]: val = config.get(config_key) if is_valid_value(val): @@ -166,6 +167,18 @@ def is_valid_value(value): elif config_key in config: os.environ.pop(env_key, None) +def apply_swanlab_mode(cli_mode: Optional[str], config: Dict[str, str]): + """Resolve swanlab mode from CLI > config > env, and set SWANLAB_MODE env var.""" + mode = pick_value(cli_mode, config, "swanlab_mode", "") + if mode: + os.environ["SWANLAB_MODE"] = mode + + +def is_swanlab_disabled() -> bool: + """Return True if SWANLAB_MODE is set to 'disabled'.""" + return os.getenv("SWANLAB_MODE", "").lower() == "disabled" + + def pick_value(cli_value: Optional[str], config: Dict[str, str], key: str, default: str) -> str: if cli_value not in (None, ""): return cli_value diff --git a/scripts/atest/config.py b/scripts/atest/config.py index ab8dd0a2..85226e0b 100644 --- a/scripts/atest/config.py +++ b/scripts/atest/config.py @@ -29,6 +29,7 @@ "swanlab_api_key", "swanlab_workspace", "swanlab_proj_name", + "swanlab_mode", ] CONFIG_DIR = Path.home() / ".sichek" @@ -162,6 +163,10 @@ def config_create(): "swanlab project", config.get("swanlab_proj_name", "") or config.get("swanlab_proj_name", "") ) + new_config["swanlab_mode"] = ask( + "swanlab mode [cloud, offline, disabled, local]", + 
config.get("swanlab_mode", "cloud") + ) # Validate default_spec if provided if new_config.get("default_spec"): diff --git a/scripts/atest/deepeptest_multi_node.py b/scripts/atest/deepeptest_multi_node.py index 05693be3..3d97dadc 100644 --- a/scripts/atest/deepeptest_multi_node.py +++ b/scripts/atest/deepeptest_multi_node.py @@ -33,6 +33,8 @@ parse_hostnames, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, start_kubectl_log_stream, wait_for_pods_ready, ) @@ -152,23 +154,26 @@ def main() -> None: parser.add_argument("--job-name", default=None) parser.add_argument("--namespace", default="default") parser.add_argument("--cmd", default="", help="Command to run (same as deepeptest_single_node)") - parser.add_argument("--image-repo", default="registry-taihua.siflow.cn/hisys/mcore", help="Container image repository") - parser.add_argument("--image-tag", default="pytorch25.11-cuda13-cudnn9.17-te-main-v1", help="Container image tag") + parser.add_argument("--image-repo", default=None, help="Container image repository (or set pytorchjob_image_repo in config)") + parser.add_argument("--image-tag", default=None, help="Container image tag (or set pytorchjob_image_tag in config)") parser.add_argument("--timeout", type=int, default=600) parser.add_argument("--scheduler-name", default=None) parser.add_argument("--roce-shared-mode", default=None) parser.add_argument("--hostfile", default="None") parser.add_argument("--host", default="None") parser.add_argument("--host-dir", default=None, help="Host directory to mount (e.g. 
/tmp/DeepEP)") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value( - args.image_repo, config, "pytorchjob_image_repo", "registry-taihua.siflow.cn/hisys/mcore" + args.image_repo, config, "pytorchjob_image_repo", "registry-us-east.scitix.ai/hisys/mcore" ) args.image_tag = pick_value( - args.image_tag, config, "pytorchjob_image_tag", "pytorch25.11-cuda13-cudnn9.17-te-main-v1" + args.image_tag, config, "pytorchjob_image_tag", "v2.1-cudnn9.14-te2.8-cuda_arch_10.0_at" ) args.scheduler_name = pick_value(args.scheduler_name, config, "scheduler", "si-scheduler") args.roce_shared_mode = pick_value(args.roce_shared_mode, config, "roce_shared_mode", "none") @@ -191,7 +196,7 @@ def main() -> None: num_experts = 8 * num_workers default_cmd = ( "python /tmp/DeepEP/tests/test_internode.py " - f"--num-processes 8 --num-tokens 4096 --hidden 7168 --num-topk 8 --num-experts {num_experts}" + f"--num-processes 8 --num-tokens 4096 --hidden 7168 --num-topk 8 --num-experts 256" ) cmd = args.cmd if args.cmd else default_cmd if args.host_dir is not None: @@ -222,7 +227,7 @@ def signal_handler(sig: int, frame: Any) -> None: signal.signal(signal.SIGTERM, signal_handler) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=args.job_name, description=f"DeepEP tuning multi-node ({num_workers} workers)", diff --git a/scripts/atest/deepeptest_single_node.py b/scripts/atest/deepeptest_single_node.py index aaa24d31..2cea3deb 100644 --- a/scripts/atest/deepeptest_single_node.py +++ b/scripts/atest/deepeptest_single_node.py @@ -22,6 +22,8 @@ parse_hostnames, load_user_config, pick_value, + 
apply_swanlab_mode, + is_swanlab_disabled, ) from mpijob_helper import ( @@ -121,8 +123,8 @@ def main() -> None: parser.description = ( "Runs DeepEP intranode test on each worker pod and prints the three Best results per node." ) - parser.add_argument("--image-repo", default="registry-taihua.siflow.cn/hisys/mcore", help="Container image repository") - parser.add_argument("--image-tag", default="pytorch25.11-cuda13-cudnn9.17-te-main-v1", help="Container image tag") + parser.add_argument("--image-repo", default=None, help="Container image repository (or set pytorchjob_image_repo in config)") + parser.add_argument("--image-tag", default=None, help="Container image tag (or set pytorchjob_image_tag in config)") parser.add_argument( "--host-dir", default=None, @@ -131,6 +133,7 @@ def main() -> None: args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) default_cmd = ( "python /tmp/DeepEP/tests/test_intranode.py " @@ -169,12 +172,13 @@ def main() -> None: timeout=args.timeout, max_parallel_jobs=args.max_parallel_jobs, cmd=cmd, + request_gpu=not args.no_request_gpu, ) runner = MPIJobRunner(mpijob_config) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=mpijob_config.job_name, description=f"DeepEP tuning ({len(mpijob_config.hostnames)} workers)", diff --git a/scripts/atest/modeltest_multi_node.py b/scripts/atest/modeltest_multi_node.py index b9869a3d..7926cd85 100755 --- a/scripts/atest/modeltest_multi_node.py +++ b/scripts/atest/modeltest_multi_node.py @@ -5,6 +5,7 @@ import os import argparse +import re import shlex import signal import sys @@ -35,6 +36,8 @@ summarize, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, start_kubectl_log_stream, wait_for_pods_ready, ) @@ -47,7 +50,7 @@ def main(): parser.add_argument("--cmd", default="", 
help="command to run in each pod: PP=1 MBS=4 bash /workspace/ai4s-job-system/mcore_trainer/demos/llama/train_llama2_70b_bf16.sh by default") parser.add_argument("--image-repo", default=None) parser.add_argument("--image-tag", default=None) - parser.add_argument("--timeout", type=int, default=600) + parser.add_argument("--timeout", type=int, default=3600) parser.add_argument("--scheduler-name", default=None) parser.add_argument("--roce-shared-mode", default=None) parser.add_argument("--hostfile", default="None") @@ -59,10 +62,13 @@ def main(): parser.add_argument("--ep", type=int, default=None, help="Expert Parallelism size") parser.add_argument("--host-dir", default=None, help="host directory to mount in pytorchjob pods") parser.add_argument("--gpu-type", default=None, help="GPU type") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value(args.image_repo, config, "pytorchjob_image_repo", "registry-us-east.scitix.ai/hisys/mcore") @@ -110,7 +116,9 @@ def main(): cmd = f"EP={args.ep} {cmd}" if args.host_dir is not None: cmd = f"OLMO_CORE_DIR={args.host_dir} {cmd}" - if os.getenv("SWANLAB_API_KEY") is not None: + if is_swanlab_disabled(): + cmd = f"export SWANLAB_MODE=disabled && {cmd}" + elif os.getenv("SWANLAB_API_KEY") is not None: cmd = ( f"export SWANLAB_API_KEY={os.getenv('SWANLAB_API_KEY')} && " f"export SWANLAB_WORKSPACE={os.getenv('SWANLAB_WORKSPACE')} && " @@ -119,10 +127,7 @@ def main(): f"{cmd}" ) else: - cmd = ( - f"export SWANLAB_MODE=disabled && " - f"{cmd}" - ) + cmd = f"export SWANLAB_MODE=disabled && {cmd}" scripts_dir = Path(__file__).parent.resolve() helm_dir = scripts_dir.parent.parent / "k8s" / "sichek" @@ -149,7 +154,7 @@ def signal_handler(sig, frame): signal.signal(signal.SIGTERM, 
signal_handler) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=args.job_name, description=f"Llama2 70B benchmark ({num_workers} workers)", @@ -253,7 +258,12 @@ def signal_handler(sig, frame): if rc != 0 or not out.strip(): echo_warn(f"No worker pods found for job '{args.job_name}'") return - pods = sorted(out.strip().split()) + # Sort by worker index (e.g. worker-9 < worker-10 < worker-99 < worker-100) + def _worker_index(name): + m = re.search(r"worker-(\d+)$", name) + return int(m.group(1)) if m else -1 + + pods = sorted(out.strip().split(), key=_worker_index) last_pod = pods[-1] echo_info(f"Last worker pod name: {last_pod}") diff --git a/scripts/atest/modeltest_single_node.py b/scripts/atest/modeltest_single_node.py index 37ee3323..ec9d69f2 100644 --- a/scripts/atest/modeltest_single_node.py +++ b/scripts/atest/modeltest_single_node.py @@ -31,6 +31,8 @@ summarize, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, ) from mpijob_helper import ( @@ -120,6 +122,7 @@ def main(): args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) default_cmd = ( "bash /workspace/ai4s-job-system/mcore_trainer/demos/llama/train_llama2_13b_bf16.sh" @@ -156,12 +159,13 @@ def main(): timeout=args.timeout, max_parallel_jobs=args.max_parallel_jobs, cmd=cmd, + request_gpu=not args.no_request_gpu, ) runner = MPIJobRunner(mpijob_config) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=mpijob_config.job_name, description=f"Model benchmark ({len(mpijob_config.hostnames)} workers)", diff --git a/scripts/atest/mpijob_helper.py b/scripts/atest/mpijob_helper.py index ff3fd3e5..f8046f2e 100644 --- 
a/scripts/atest/mpijob_helper.py +++ b/scripts/atest/mpijob_helper.py @@ -128,7 +128,7 @@ def deploy(self): f"--set mpijob.name={shlex.quote(cfg.job_name)} " f"--set mpijob.numWorkers={len(cfg.hostnames)} " f"--set 'mpijob.nodeAffinityHosts={{{host_csv}}}' " - f"--set 'mpijob.requestGpu={cfg.request_gpu}'" + f"--set 'mpijob.requestGpu={'true' if cfg.request_gpu else 'false'}'" ) run_cmd_check(helm_cmd) @@ -322,13 +322,15 @@ def create_mpijob_arg_parser(description: str): parser.add_argument("--namespace", default="default", help="Kubernetes namespace") parser.add_argument("--cmd", default="", help="Command to run in each pod") # Note: --image-repo and --image-tag should be added by each script as they differ - parser.add_argument("--timeout", type=int, default=600, help="Timeout in seconds") + parser.add_argument("--timeout", type=int, default=3600, help="Timeout in seconds") parser.add_argument("--scheduler-name", default=None, help="Kubernetes scheduler name") parser.add_argument("--roce-shared-mode", default=None, help="RoCE shared mode") parser.add_argument("--hostfile", default="None", help="File containing hostnames") parser.add_argument("--host", default="None", help="Comma-separated hostnames") parser.add_argument("--max-parallel-jobs", type=int, default=200, help="Max parallel jobs") - + parser.add_argument("--no-request-gpu", action="store_true", help="Do not request GPU resources (default: request GPU)") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") return parser diff --git a/scripts/atest/nccltest_multi_node.py b/scripts/atest/nccltest_multi_node.py index 5d6f6d8e..c8112cce 100755 --- a/scripts/atest/nccltest_multi_node.py +++ b/scripts/atest/nccltest_multi_node.py @@ -37,6 +37,8 @@ parse_nccltest_bandwidth, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, start_kubectl_log_stream,
wait_for_pods_ready, ) @@ -56,10 +58,13 @@ def main(): parser.add_argument("--roce-shared-mode", default=None) parser.add_argument("--hostfile", default="None") parser.add_argument("--host", default="None") - parser.add_argument("--request-gpu", action="store_true", help="Request GPU resources for each worker pod") + parser.add_argument("--no-request-gpu", action="store_true", help="Do not request GPU resources (default: request GPU)") + parser.add_argument("--swanlab-mode", type=str, default=None, choices=["cloud", "offline", "disabled", "local"], + help="SwanLab mode: cloud (default), offline, disabled, local") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value(args.image_repo, config, "image_repo", "registry-us-east.scitix.ai/hisys/sichek") args.image_tag = pick_value(args.image_tag, config, "image_tag", "latest") args.scheduler_name = pick_value(args.scheduler_name, config, "scheduler", "si-scheduler") @@ -101,7 +106,7 @@ def signal_handler(sig, frame): signal.signal(signal.SIGTERM, signal_handler) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=args.job_name, description=f"NCCL benchmark multi-node ({num_workers} workers)", @@ -125,7 +130,7 @@ def signal_handler(sig, frame): echo_info(f"Timeout: {args.timeout} seconds") host_csv = ",".join(hostnames) - gpu_flag = "true" if args.request_gpu else "false" + gpu_flag = "false" if args.no_request_gpu else "true" helm_cmd = ( f"helm upgrade --install {shlex.quote(args.job_name)} {shlex.quote(str(helm_dir))} " f"--atomic " @@ -220,14 +225,23 @@ def signal_handler(sig, frame): time.sleep(5) waited += 5 - run_cmd_line = ( - "timeout {timeout} /usr/local/sihpc/bin/mpirun " + # For all+reduce tests on >64 nodes, use -b32g -e32g + nccl_extra = "" + if "all" in cmd and "reduce" in cmd and 
num_workers > 64: + nccl_extra = " -b32g -e32g" + mpirun_part = ( + "/usr/local/sihpc/bin/mpirun " "--allow-run-as-root --map-by ppr:8:node " "--mca oob_tcp_if_include eth0 --mca pml ^ucx --mca btl self,tcp " "--mca btl_tcp_if_include eth0 --mca routed direct --mca plm_rsh_no_tree_spawn 1 " - "-x UCX_TLS=tcp -x NCCL_MIN_NCHANNELS=32 -x NCCL_IB_QPS_PER_CONNECTION=8 " - "/usr/local/sihpc/libexec/nccl-tests/nccl_test -l{cmd}" - ).format(timeout=args.timeout, cmd=cmd) + "-x UCX_TLS=tcp -x NCCL_DEBUG=WARN -x NCCL_MIN_NCHANNELS=32 -x NCCL_IB_QPS_PER_CONNECTION=8 " + "/usr/local/sihpc/libexec/nccl-tests/nccl_test -l{cmd}{extra}" + ).format(cmd=cmd, extra=nccl_extra) + + if "-N 0" in cmd: + run_cmd_line = mpirun_part + else: + run_cmd_line = f"timeout {args.timeout} {mpirun_part}" echo_info(f"Running NCCL test: {label}") # Wrap command to tee output to container's main process stdout diff --git a/scripts/atest/nccltest_single_node.py b/scripts/atest/nccltest_single_node.py index f3b679ff..603adc1e 100755 --- a/scripts/atest/nccltest_single_node.py +++ b/scripts/atest/nccltest_single_node.py @@ -29,6 +29,8 @@ parse_hostnames, load_user_config, pick_value, + apply_swanlab_mode, + is_swanlab_disabled, ) from mpijob_helper import ( @@ -117,16 +119,16 @@ def main(): ) parser.add_argument("--image-repo", default=None, help="Container image repository") parser.add_argument("--image-tag", default=None, help="Container image tag") - parser.add_argument("--request-gpu", action="store_true", help="Request GPU resources for each worker pod") args = parser.parse_args() config = load_user_config() + apply_swanlab_mode(args.swanlab_mode, config) image_repo = pick_value(args.image_repo, config, "image_repo", "registry-us-east.scitix.ai/hisys/sichek") image_tag = pick_value(args.image_tag, config, "image_tag", "latest") if not args.cmd: - cmd = "NCCL_DEBUG=INFO /usr/local/sihpc/libexec/nccl-tests/nccl_test -g 8" + cmd = "NCCL_DEBUG=WARN /usr/local/sihpc/libexec/nccl-tests/nccl_test -g 8" 
else: cmd = args.cmd @@ -151,13 +153,13 @@ def main(): timeout=args.timeout, max_parallel_jobs=args.max_parallel_jobs, cmd=cmd, - request_gpu=args.request_gpu, + request_gpu=not args.no_request_gpu, ) runner = MPIJobRunner(mpijob_config) swan_run = None - if os.getenv("SWANLAB_API_KEY") and swanlab is not None: + if os.getenv("SWANLAB_API_KEY") and swanlab is not None and not is_swanlab_disabled(): swan_run = swanlab.init( experiment_name=mpijob_config.job_name, description=f"NCCL benchmark ({len(mpijob_config.hostnames)} workers)", diff --git a/scripts/postinstall.sh b/scripts/postinstall.sh index b5128aeb..b2984a93 100755 --- a/scripts/postinstall.sh +++ b/scripts/postinstall.sh @@ -99,6 +99,9 @@ create_at_wrapper "sichek-k8s-llama2-70b" "${SICHEK_SCRIPTS_PATH}/atest/modeltes create_at_wrapper "sichek-k8s-olmo3-7b" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_single_node.py" --job-name sichek-olmo3-7b --cmd "bash /workspace/ai4s-job-system/mcore_trainer/demos/OLMo3/OLMo-3-1025-7B-pretrain-1.sh" create_at_wrapper "sichek-k8s-olmo3-7b-multinode" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_multi_node.py" --job-name sichek-olmo3-7b --cmd "bash /workspace/ai4s-job-system/mcore_trainer/demos/OLMo3/OLMo-3-1025-7B-pretrain-1.sh" create_at_wrapper "sichek-k8s-qwen-a3b" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_multi_node.py" --job-name sichek-qwen-a3b --cmd "MAX_STEPS=128 NCCL_DEBUG=WARN bash /workspace/ai4s-job-system/mcore_trainer/demos/deepseek/sft_deepseekv3.1_base.sh" +create_at_wrapper "sichek-k8s-modeltest-multinode" "${SICHEK_SCRIPTS_PATH}/atest/modeltest_multi_node.py" +create_at_wrapper "sichek-k8s-deepeptest-singlenode" "${SICHEK_SCRIPTS_PATH}/atest/deepeptest_single_node.py" +create_at_wrapper "sichek-k8s-deepeptest-multinode" "${SICHEK_SCRIPTS_PATH}/atest/deepeptest_multi_node.py" # Run check_sicl.sh bash /var/sichek/scripts/check_sicl.sh || echo "Failed to run SICL installer" diff --git a/scripts/sichek-ib-topo b/scripts/sichek-ib-topo index eda587c1..d2fba835 100755 
--- a/scripts/sichek-ib-topo +++ b/scripts/sichek-ib-topo @@ -64,7 +64,7 @@ def parse_iblinkinfo(file_path: str): "port": int(port), "state": state.strip(), "peer": { - "lid": peer_lid if peer_lid else None, + "lid": int(peer_lid) if peer_lid else None, "port": int(peer_port) if peer_port else None, "name": peer_name.strip(), "type": peer_type, @@ -263,6 +263,9 @@ def group_leaf_switches_by_hosts(switches: dict, verbose: bool = False): # Filter out empty groups (Leaf Switches not connected to hosts) groups = {k: v for k, v in groups.items() if k} # Remove empty groups + # Sort by host_key (tuple of sorted hostnames) so group numbering is stable across runs + groups = dict(sorted(groups.items(), key=lambda x: x[0])) + # Summary total_groups = len(groups) total_leafs = len(leaf_switches) @@ -386,14 +389,17 @@ def check_leaf_to_spine_links_by_group( Check connection distribution from each leaf switch to spine by group: - Expected to connect to {expected_spine_count} different Spine Switches - Only count ports in LinkUp state + - Also collect and print the list of spine GUIDs connected to each group """ issues = [] total_groups = len(groups) passed_groups = 0 rows = [] + group_spines = {} # group_id -> sorted list of spine GUIDs (connected to any leaf in group) for group_id, (hosts, sw_guids) in enumerate(groups.items(), 1): group_passed = True + group_spine_set = set() for sw_guid in sw_guids: sw = switches[sw_guid] spine_links = defaultdict(list) # spine_name -> [port list] @@ -403,14 +409,16 @@ def check_leaf_to_spine_links_by_group( continue peer = p.get("peer") if peer and peer.get("type") == "switch": - # Check if peer is a spine - peer_sw = None + # Match peer by LID (unique per switch), not by name (multiple spines can share same model name) + peer_lid = peer.get("lid") + peer_sw_id = None for other_sw_id, other_sw in switches.items(): - if other_sw["name"] == peer["name"] and other_sw.get("type") == "spine_sw": - peer_sw = other_sw + if other_sw.get("type") == 
"spine_sw" and other_sw.get("lid") == peer_lid: + peer_sw_id = other_sw_id break - if peer_sw: - spine_links[peer["name"]].append(p["port"]) + if peer_sw_id: + spine_links[peer_sw_id].append(p["port"]) + group_spine_set.add(peer_sw_id) actual_spine_count = len(spine_links) down_ports = [p["port"] for p in sw["ports"] if "LinkUp" not in p["state"]] @@ -433,6 +441,7 @@ def check_leaf_to_spine_links_by_group( str(down_ports), ]) + group_spines[group_id] = sorted(group_spine_set) if group_passed: passed_groups += 1 @@ -451,6 +460,12 @@ def check_leaf_to_spine_links_by_group( for row in rows: print("| " + " | ".join(f"{row[i]:<{col_widths[i]}}" for i in range(len(row))) + " |") + # Print spine GUIDs per group (for leafsw_group etc.) + print("\nSpine switches connected to each leaf group (spine GUIDs):") + for group_id in sorted(group_spines.keys()): + spines = group_spines[group_id] + print(f" Group {group_id} ({len(spines)} spines): " + ", ".join(spines)) + return issues From cfd8da9653b03f6d4ea59df838479c38e9dd1e11 Mon Sep 17 00:00:00 2001 From: xlliu Date: Tue, 3 Mar 2026 03:18:20 +0000 Subject: [PATCH 3/4] add timeout to nvidia collector --- components/nvidia/nvidia.go | 5 ++++- scripts/atest/deepeptest_multi_node.py | 4 ++-- scripts/sichek-gpu-ib-occupy | 31 ++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/components/nvidia/nvidia.go b/components/nvidia/nvidia.go index 69efbac3..7f795004 100644 --- a/components/nvidia/nvidia.go +++ b/components/nvidia/nvidia.go @@ -282,10 +282,13 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp return component, nil } + // Use a timeout for collector init so nvidia-smi (SoftwareInfo.Get) cannot hang forever + collectorCtx, collectorCancel := context.WithTimeout(ctx, consts.CmdTimeout) + defer collectorCancel() // Pass the shared pointer to collector // Note: NVML calls in collector are protected by locks in nvidia.go where collector methods are called 
component.nvmlMtx.Lock() - collectorPointer, err := collector.NewNvidiaCollector(ctx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name) + collectorPointer, err := collector.NewNvidiaCollector(collectorCtx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name) component.nvmlMtx.Unlock() if err != nil { logrus.WithField("component", "nvidia").Errorf("NewNvidiaCollector failed: %v", err) diff --git a/scripts/atest/deepeptest_multi_node.py b/scripts/atest/deepeptest_multi_node.py index 3d97dadc..f63ad0c2 100644 --- a/scripts/atest/deepeptest_multi_node.py +++ b/scripts/atest/deepeptest_multi_node.py @@ -170,10 +170,10 @@ def main() -> None: apply_swanlab_mode(args.swanlab_mode, config) args.image_repo = pick_value( - args.image_repo, config, "pytorchjob_image_repo", "registry-us-east.scitix.ai/hisys/mcore" + args.image_repo, config, "pytorchjob_image_repo", "" ) args.image_tag = pick_value( - args.image_tag, config, "pytorchjob_image_tag", "v2.1-cudnn9.14-te2.8-cuda_arch_10.0_at" + args.image_tag, config, "pytorchjob_image_tag", "" ) args.scheduler_name = pick_value(args.scheduler_name, config, "scheduler", "si-scheduler") args.roce_shared_mode = pick_value(args.roce_shared_mode, config, "roce_shared_mode", "none") diff --git a/scripts/sichek-gpu-ib-occupy b/scripts/sichek-gpu-ib-occupy index 8be84566..5a1c70a4 100755 --- a/scripts/sichek-gpu-ib-occupy +++ b/scripts/sichek-gpu-ib-occupy @@ -5,6 +5,29 @@ set -euo pipefail MIN_FREE_GPU=8 MIN_FREE_IB=32 +# Convert Kubernetes quantity string to integer for arithmetic (e.g. 
1k -> 1000, 2 -> 2) +to_int_quantity() { + local v="${1:-0}" + v="${v//[[:space:]]/}" + if [[ "$v" =~ ^[0-9]+$ ]]; then + echo "$v" + elif [[ "$v" =~ ^([0-9]+)k$ ]]; then + echo $((${BASH_REMATCH[1]} * 1000)) + elif [[ "$v" =~ ^([0-9]+)Ki$ ]]; then + echo $((${BASH_REMATCH[1]} * 1024)) + elif [[ "$v" =~ ^([0-9]+)M$ ]]; then + echo $((${BASH_REMATCH[1]} * 1000000)) + elif [[ "$v" =~ ^([0-9]+)Mi$ ]]; then + echo $((${BASH_REMATCH[1]} * 1048576)) + elif [[ "$v" =~ ^([0-9]+)G$ ]]; then + echo $((${BASH_REMATCH[1]} * 1000000000)) + elif [[ "$v" =~ ^([0-9]+)Gi$ ]]; then + echo $((${BASH_REMATCH[1]} * 1073741824)) + else + echo "0" + fi +} + # Parse parameters while [[ $# -gt 0 ]]; do case "$1" in @@ -51,7 +74,9 @@ for node in $(kubectl get nodes -o json | jq -r '.items[] | select(.status.alloc # GPU allocated gpu_allocated=$(kubectl get pods -A --field-selector spec.nodeName="$node",status.phase=Running \ -o json | jq "[.items[].spec.containers[].resources.requests[\"$GPU_RES\"] | (tonumber? // 0)] | add // 0") - gpu_free=$((gpu_allocatable - gpu_allocated)) + gpu_allocatable_int=$(to_int_quantity "$gpu_allocatable") + gpu_allocated_int=$(to_int_quantity "$gpu_allocated") + gpu_free=$((gpu_allocatable_int - gpu_allocated_int)) # IB capacity & allocatable ib_capacity=$(kubectl get node "$node" -o json | jq -r --arg res "$IB_RES" '.status.capacity[$res] // 0') @@ -59,7 +84,9 @@ for node in $(kubectl get nodes -o json | jq -r '.items[] | select(.status.alloc # IB allocated ib_allocated=$(kubectl get pods -A --field-selector spec.nodeName="$node",status.phase=Running \ -o json | jq "[.items[].spec.containers[].resources.requests[\"$IB_RES\"] | (tonumber? 
// 0)] | add // 0") - ib_free=$((ib_allocatable - ib_allocated)) + ib_allocatable_int=$(to_int_quantity "$ib_allocatable") + ib_allocated_int=$(to_int_quantity "$ib_allocated") + ib_free=$((ib_allocatable_int - ib_allocated_int)) # Condition: remaining resources < specified value if [[ "$gpu_free" -lt "$MIN_FREE_GPU" || "$ib_free" -lt "$MIN_FREE_IB" ]]; then From 1f32fdf0420975f2775b6c212607ff4aa746d8d3 Mon Sep 17 00:00:00 2001 From: xlliu Date: Tue, 3 Mar 2026 04:22:20 +0000 Subject: [PATCH 4/4] add deploy.yaml --- k8s/README.md | 137 ++++++++++++++++++++++++++++++++ install.yaml => k8s/deploy.yaml | 100 +++++++++++++++++++---- 2 files changed, 220 insertions(+), 17 deletions(-) create mode 100644 k8s/README.md rename install.yaml => k8s/deploy.yaml (77%) diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 00000000..32e31f6c --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,137 @@ +# `k8s/deploy.yaml` README + +This document explains how to render, apply, and operate `k8s/deploy.yaml`. + +## What this manifest contains + +`k8s/deploy.yaml` defines: + +- A `DaemonSet` named `sichek-gpu` in namespace `monitoring` +- A `ServiceAccount` (`sa-sichek`) +- A `ClusterRole` and `ClusterRoleBinding` +- A `PodMonitor` (`sichek-exporter`) + +The DaemonSet has: + +- `initContainer` (`sichek-init`) that installs/verifies `sichek` on the host +- Main container (`sichek`) that keeps host-side `sichek` service alive +- Exporter container (`sichek-exporter`) that exposes metrics over HTTP + +Important runtime characteristics: + +- `hostPID: true` +- `hostNetwork: true` +- `privileged: true` +- Uses `nsenter` to execute commands in host namespaces +- Uses hostPath `/var/sichek` (mounted as `/host/var/sichek` in containers) + +## Template variables you must render + +`k8s/deploy.yaml` is templated. 
Replace these placeholders before apply: + +- `{{ .registry }}`: image registry, for example `registry.example.com` +- `{{ .version }}`: sichek version tag, for example `v0.7.6` +- `{{ .sichek_spec_url }}`: spec fallback URL (or empty string) +- `{{ .metrics_port }}`: exporter metrics port, for example `19092` + +## Prerequisites + +1. Namespace exists: + +```bash +kubectl create namespace monitoring +``` + +2. ConfigMaps exist in `monitoring`: + +- `sichek-default-spec` with key `default_spec.yaml` +- `sichek-default-user-config` with key `default_user_config.yaml` + +Example: + +```bash +kubectl create configmap sichek-default-spec \ + --from-file=default_spec.yaml=/path/to/default_spec.yaml \ + -n monitoring + +kubectl create configmap sichek-default-user-config \ + --from-file=default_user_config.yaml=/path/to/default_user_config.yaml \ + -n monitoring +``` + +3. Cluster has permissions and binaries expected by the script on target nodes: + +- `systemd`/`systemctl` +- package manager (`rpm` or `dpkg`) +- host can run `sichek` after install + +## Render and apply + +From repository root: + +```bash +export REGISTRY="registry-ap-southeast.scitix.ai" +export SICHEK_VERSION="v0.7.6" +export SICHEK_SPEC_URL='""' +export METRICS_PORT="19092" + +sed -e "s|{{ \\.registry }}|${REGISTRY}|g" \ + -e "s|{{ \\.version }}|${SICHEK_VERSION}|g" \ + -e "s|{{ \\.sichek_spec_url }}|${SICHEK_SPEC_URL}|g" \ + -e "s|{{ \\.metrics_port }}|${METRICS_PORT}|g" \ + k8s/deploy.yaml > k8s/deploy.rendered.yaml + +kubectl apply -f k8s/deploy.rendered.yaml +``` + +## Verify deployment + +```bash +kubectl get daemonset -n monitoring sichek-gpu +kubectl get pods -n monitoring -l app=sichek -o wide + +# init/install logs +kubectl logs -n monitoring -c sichek-init + +# keepalive service logs +kubectl logs -n monitoring -c sichek -f + +# exporter logs +kubectl logs -n monitoring -c sichek-exporter -f +``` + +## Update flow + +### Update config only + +1. Update ConfigMaps +2. 
Restart DaemonSet: + +```bash +kubectl rollout restart daemonset/sichek-gpu -n monitoring +``` + +### Upgrade version + +1. Change rendered values (`REGISTRY` / `SICHEK_VERSION`) +2. Re-render and re-apply +3. Watch rollout: + +```bash +kubectl rollout status daemonset/sichek-gpu -n monitoring +``` + +## Uninstall + +```bash +kubectl delete -f k8s/deploy.rendered.yaml +``` + +Note: this removes Kubernetes resources, but host files under `/var/sichek` and host-installed packages may remain. + +## Troubleshooting quick notes + +- `Init` fails: check `sichek-init` logs first. +- `sichek` keeps restarting: verify host `systemctl is-active sichek`. +- Exporter exits with socket timeout: check `/var/sichek/run/current/metrics.sock` on host. +- Pod cannot schedule: inspect taints/resources and DaemonSet events. diff --git a/install.yaml b/k8s/deploy.yaml similarity index 77% rename from install.yaml rename to k8s/deploy.yaml index 853ce40b..f28a3de0 100644 --- a/install.yaml +++ b/k8s/deploy.yaml @@ -22,13 +22,7 @@ spec: spec: hostPID: true hostNetwork: true - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: scitix.ai/gpu-type - operator: Exists + tolerations: - operator: Exists @@ -54,7 +48,7 @@ spec: ########################################################### containers: - name: sichek - image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + image: "{{ .registry }}/hisys/sichek:{{ .version }}" imagePullPolicy: Always securityContext: @@ -66,7 +60,11 @@ spec: fieldRef: fieldPath: spec.nodeName - name: SICHEK_SPEC_URL - value: {{ .sichek_spec_url }} + value: {{ .sichek_spec_url }} + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid volumeMounts: - name: sichek-host @@ -79,8 +77,7 @@ spec: - /bin/bash - -c - | - nsenter -t 1 -m -p -n -u -i -- \ - rm -rf /var/sichek/run/pods/${POD_UID} + rm -rf /host/var/sichek/run/pods/${POD_UID} command: ["/bin/bash", "-c"] args: @@ 
-109,9 +106,14 @@ spec: memory: 1Gi - name: sichek-exporter - image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + image: "{{ .registry }}/hisys/sichek:{{ .version }}" securityContext: privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: sichek-host mountPath: /host/var/sichek @@ -123,10 +125,27 @@ spec: - | set -euo pipefail + echo "[sichek] restart dcgm-exportor" + HOST="nsenter -t 1 -m -p -n -u -i --" + $HOST kubectl --kubeconfig=/var/sichek/run/current/kubeconfig delete pod -nmonitoring -lapp.kubernetes.io/name=dcgm-exporter --field-selector spec.nodeName=${NODE_NAME} + $HOST kubectl --kubeconfig=/var/sichek/run/current/kubeconfig get pod -nmonitoring -lapp.kubernetes.io/name=dcgm-exporter --field-selector spec.nodeName=${NODE_NAME} + RUNTIME=/host/var/sichek/run/current + timeout=10 + elapsed=0 + while [ ! -S "${RUNTIME}/metrics.sock" ]; do + if [ $elapsed -ge $timeout ]; then + echo "[sichek] ERROR: ${RUNTIME}/metrics.sock not ready after ${timeout}s" + exit 1 + fi + sleep 1 + elapsed=$((elapsed + 1)) + done + + echo "[sichek] ${RUNTIME}/metrics.sock is ready, starting exporter..." 
sichek exporter \ --metrics-socket ${RUNTIME}/metrics.sock \ - --listen :{{ .metrics_port }} + --listen :19092 resources: requests: @@ -141,7 +160,7 @@ spec: ############################################################ initContainers: - name: sichek-init - image: registry-ap-southeast.scitix.ai/hisys/sichek:v0.7.6 + image: "{{ .registry }}/hisys/sichek:{{ .version }}" imagePullPolicy: Always securityContext: @@ -235,6 +254,10 @@ spec: echo "[sichek] install via dpkg" cp "$SICHEK_DEB" "${RUNTIME_BASE}/" $HOST dpkg -P sichek || echo "Failed to remove existing sichek" + ###dpkg: warning: while removing sichek, directory '/var/sichek/scripts' not empty so not removed + ###dpkg: warning: while removing sichek, directory '/var/sichek/config' not empty so not removed + rm -rf /host/var/sichek/scripts + rm -rf /host/var/sichek/config $HOST dpkg -i "${HOST_RUNTIME_BASE}/$(basename "$SICHEK_DEB")" rm "${RUNTIME_BASE}/$(basename "$SICHEK_DEB")" else @@ -313,6 +336,27 @@ spec: chmod 644 "${TMP_ENV}" mv "${TMP_ENV}" "${RUNTIME_BASE}/env" + ########################################################################### + # 3. 
delete pod dir in /var/sichek/run/pods except current and canary link + ########################################################################### + PODS_DIR="$RUNTIME_BASE/pods" + CURRENT_TARGET=$(readlink -f "$RUNTIME_BASE/current") + CANARY_TARGET=$(readlink -f "$RUNTIME_BASE/canary") + echo "Current points to: $CURRENT_TARGET" + echo "Canary points to: $CANARY_TARGET" + + for d in "$PODS_DIR"/*; do + [ -d "$d" ] || continue + + # skip current/canary dir + if [[ "$d" == "$CURRENT_TARGET" || "$d" == "$CANARY_TARGET" ]]; then + echo "Skipping $d" + continue + fi + echo "Deleting $d" + rm -rf "$d" + done + dnsPolicy: ClusterFirstWithHostNet restartPolicy: Always terminationGracePeriodSeconds: 30 @@ -331,7 +375,7 @@ metadata: rules: - apiGroups: ["kubeflow.org", ""] resources: ["nodes", "pods", "pytorchjobs"] - verbs: ["get", "list", "patch", "update", "watch"] + verbs: ["get", "list", "patch", "update", "watch", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -339,10 +383,32 @@ metadata: name: cluster-role-binding-sichek namespace: monitoring roleRef: - apiGroup: rbac.authorization.k8s.io # Remove the /v1 + apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cluster-role-sichek subjects: - kind: ServiceAccount name: sa-sichek - namespace: monitoring \ No newline at end of file + namespace: monitoring +--- +# Source: sichek/templates/prometheus.yaml +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: sichek-exporter + namespace: monitoring + labels: + app: sichek +spec: + podMetricsEndpoints: + - interval: 15s # Scrape interval + path: /metrics # Metrics path + port: metrics + scrapeTimeout: 10s + scheme: http + namespaceSelector: + matchNames: + - monitoring + selector: + matchLabels: + app: sichek \ No newline at end of file