Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,14 @@ jobs:
ghcr.io/${{ github.repository_owner }}/sichek:latest
labels: |
org.opencontainers.image.source=https://github.com/${{ github.repository }}
- name: Build and push Docker image (ubuntu20.04, cuda12.8)
uses: docker/build-push-action@v4
with:
context: ./
file: docker/Dockerfile.cuda128
platforms: linux/amd64
push: true
tags: |
ghcr.io/${{ github.repository_owner }}/sichek:${{ github.ref_name }}-ubuntu2004-cuda12.8
labels: |
org.opencontainers.image.source=https://github.com/${{ github.repository }}
1 change: 1 addition & 0 deletions .goreleaser.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ builds:
- -mod=vendor
ldflags:
- "-s -w"
- "-X github.com/scitix/sichek/cmd/command.Version={{ .Tag }}"
- "-X github.com/scitix/sichek/cmd/command.Major={{ .Major }}"
- "-X github.com/scitix/sichek/cmd/command.Minor={{ .Minor }}"
- "-X github.com/scitix/sichek/cmd/command.Patch={{ .Patch }}"
Expand Down
7 changes: 6 additions & 1 deletion cmd/command/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ import (
)

var (
// Version is the full version string (e.g. v0.7.6.post1), set by ldflags at build time.
// When set, it is used for display instead of Major.Minor.Patch.
Version = ""
Major = ""
Minor = ""
Patch = ""
Expand All @@ -52,7 +55,9 @@ func NewVersionCmd() *cobra.Command {
now := time.Now()
BuildTime = now.Format("2006-01-02T15:04:05")
}
if Major == "" {
if Version != "" {
version = Version
} else if Major == "" {
version = "dev-" + GitCommit
} else {
version = "v" + Major + "." + Minor + "." + Patch
Expand Down
11 changes: 11 additions & 0 deletions components/infiniband/checker/ib_devs.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes

var mismatchPairs []string
infinibandInfo.RLock()
// 1) Spec -> actual: missing or wrong mapping
for expectedMlx5, expectedIb := range c.spec.IBPFDevs {
// skip mezzanine card in check
if strings.Contains(expectedMlx5, "mezz") {
Expand All @@ -80,6 +81,16 @@ func (c *IBDevsChecker) Check(ctx context.Context, data any) (*common.CheckerRes
mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s -> %s (expected %s)", expectedMlx5, actualIb, expectedIb))
}
}
// 2) Actual -> spec: extra devices not defined in spec (e.g. mlx5_7 in spec but actual shows mlx5_13_6209)
for actualMlx5 := range infinibandInfo.IBPFDevs {
if strings.Contains(actualMlx5, "mezz") {
continue
}
if _, inSpec := c.spec.IBPFDevs[actualMlx5]; !inSpec {
mismatchPairs = append(mismatchPairs, fmt.Sprintf("%s (not in spec)", actualMlx5))
logrus.WithField("component", "infiniband").Debugf("mismatch: actual device %s not defined in spec", actualMlx5)
}
}
infinibandInfo.RUnlock()

if len(mismatchPairs) > 0 {
Expand Down
5 changes: 5 additions & 0 deletions components/nvidia/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,14 @@ type NvidiaConfig struct {
QueryInterval common.Duration `json:"query_interval" yaml:"query_interval"`
CacheSize int64 `json:"cache_size" yaml:"cache_size"`
EnableMetrics bool `json:"enable_metrics" yaml:"enable_metrics"`
EnableXidPoller bool `json:"enable_xid_poller" yaml:"enable_xid_poller"`
IgnoredCheckers []string `json:"ignored_checkers,omitempty" yaml:"ignored_checkers,omitempty"`
}

// IsXidPollerEnabled reports whether the XID event poller feature is
// switched on in this configuration (driven by the `enable_xid_poller`
// config key).
func (c *NvidiaConfig) IsXidPollerEnabled() bool {
	return c.EnableXidPoller
}

func (c *NvidiaUserConfig) GetCheckerSpec() map[string]common.CheckerSpec {
commonCfgMap := make(map[string]common.CheckerSpec)
return commonCfgMap
Expand Down
55 changes: 40 additions & 15 deletions components/nvidia/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ type component struct {
metrics *metrics.NvidiaMetrics

initError error // Track initialization errors with detailed information

// ReNewNvml rate limit: when called from HealthCheck timeout path, skip if last ReNewNvml was < 60s ago
renewBackoffMtx sync.Mutex
lastReNewNvmlTime time.Time
}

var (
Expand Down Expand Up @@ -117,6 +121,17 @@ func NewNvml(ctx context.Context) (nvml.Interface, error) {
// Note: The caller must hold c.healthCheckMtx lock before calling this function
// to ensure thread-safe access to nvmlInst and xidPoller.
func ReNewNvml(c *component) error {
const minReNewNvmlInterval = 60 * time.Second
// When nvmlInst exists (called from HealthCheck timeout path), rate-limit to avoid thrashing with DCGM
if c.nvmlInst != nil {
c.renewBackoffMtx.Lock()
last := c.lastReNewNvmlTime
c.renewBackoffMtx.Unlock()
if !last.IsZero() && time.Since(last) < minReNewNvmlInterval {
logrus.WithField("component", "nvidia").Debugf("ReNewNvml skipped (last run %v ago)", time.Since(last))
return nil
}
}

// Stop the XidEventPoller before shutting down NVML to prevent SIGSEGV
if c.xidPoller != nil {
Expand Down Expand Up @@ -144,31 +159,34 @@ func ReNewNvml(c *component) error {
if c.nvmlInstPtr != nil {
*c.nvmlInstPtr = nvmlInst
}
// Recreate the XidEventPoller with the new NVML instance
// Use RLock to check running status (nested lock: healthCheckMtx -> serviceMtx is safe)
// Recreate the XidEventPoller with the new NVML instance only when enabled
c.serviceMtx.RLock()
isRunning := c.running
c.serviceMtx.RUnlock()
if isRunning {
newXidPoller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel)
var newPoller *XidEventPoller
if isRunning && c.cfg != nil && c.cfg.Nvidia != nil && c.cfg.Nvidia.IsXidPollerEnabled() {
poller, err := NewXidEventPoller(c.ctx, c.cfg, nvmlInst, &c.nvmlMtx, c.resultChannel)
if err != nil {
logrus.WithField("component", "nvidia").Errorf("failed to recreate xid poller after NVML reinit: %v", err)
} else {
c.xidPoller = newXidPoller
// Restart the poller in a goroutine if the component is still running
newPoller = poller
go func() {
defer func() {
if err := recover(); err != nil {
fmt.Printf("[xidPoller] panic err is %s\n", err)
}
}()
err := c.xidPoller.Start()
err := poller.Start()
if err != nil {
logrus.WithField("component", "nvidia").Errorf("start xid poller failed after reinit: %v", err)
}
}()
}
}
c.xidPoller = newPoller
c.renewBackoffMtx.Lock()
c.lastReNewNvmlTime = time.Now()
c.renewBackoffMtx.Unlock()
}
return ret
}
Expand Down Expand Up @@ -264,10 +282,13 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp
return component, nil
}

// Use a timeout for collector init so nvidia-smi (SoftwareInfo.Get) cannot hang forever
collectorCtx, collectorCancel := context.WithTimeout(ctx, consts.CmdTimeout)
defer collectorCancel()
// Pass the shared pointer to collector
// Note: NVML calls in collector are protected by locks in nvidia.go where collector methods are called
component.nvmlMtx.Lock()
collectorPointer, err := collector.NewNvidiaCollector(ctx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name)
collectorPointer, err := collector.NewNvidiaCollector(collectorCtx, component.nvmlInstPtr, nvidiaSpecCfg.GpuNums, nvidiaSpecCfg.Name)
component.nvmlMtx.Unlock()
if err != nil {
logrus.WithField("component", "nvidia").Errorf("NewNvidiaCollector failed: %v", err)
Expand All @@ -282,11 +303,16 @@ func newNvidia(cfgFile string, specFile string, ignoredCheckers []string) (comp
return component, nil
}

xidPoller, err := NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel)
if err != nil {
logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err)
component.initError = fmt.Errorf("failed to create XID event poller: %w", err)
return component, nil
var xidPoller *XidEventPoller
if nvidiaCfg.Nvidia.IsXidPollerEnabled() {
xidPoller, err = NewXidEventPoller(ctx, nvidiaCfg, nvmlInst, &component.nvmlMtx, component.resultChannel)
if err != nil {
logrus.WithField("component", "nvidia").Errorf("NewXidEventPoller failed: %v", err)
component.initError = fmt.Errorf("failed to create XID event poller: %w", err)
return component, nil
}
} else {
logrus.WithField("component", "nvidia").Infof("XID event poller disabled")
}

freqController := common.GetFreqController()
Expand Down Expand Up @@ -517,8 +543,7 @@ func (c *component) Start() <-chan *common.Result {
}
// Check if the error message contains "Timeout"
if result != nil && len(result.Checkers) > 0 && strings.Contains(result.Checkers[0].Name, "HealthCheckTimeout") {
// Handle the timeout error
// ReNewNvml requires healthCheckMtx lock to be held
// HealthCheck timed out: try ReNewNvml
c.healthCheckMtx.Lock()
err := ReNewNvml(c)
c.healthCheckMtx.Unlock()
Expand Down
14 changes: 7 additions & 7 deletions components/nvidia/xid_poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,21 +83,21 @@ func (x *XidEventPoller) Start() error {

// waits for the duration specified in x.Cfg.UpdateInterval (in seconds)
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents
// e, err := x.XidEventSet.Wait(uint32(x.Cfg.UpdateInterval.Microseconds()))
event, ret := x.XidEventSet.Wait(200)
logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned: %v, %v", event.EventData, ret)
if ret == nvml.ERROR_NOT_SUPPORTED {
logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret))
event, ret := x.XidEventSet.Wait(uint32(1000))
if ret == nvml.ERROR_TIMEOUT {
// no event within timeout — normal
logrus.WithField("component", "nvidia").Debugf("XidEventSet.Wait timeout (no event)")
continue
}
if ret == nvml.ERROR_TIMEOUT {
// no event within timeout
if ret == nvml.ERROR_NOT_SUPPORTED {
logrus.WithField("component", "nvidia").Warningf("XidEvent not supported -- Skipping: %v", nvml.ErrorString(ret))
continue
}
if ret != nvml.SUCCESS {
logrus.WithField("component", "nvidia").Warningf("XidEventSet.Wait failure -- Retrying: %v", nvml.ErrorString(ret))
continue
}
logrus.WithField("component", "nvidia").Infof("XidEventSet.Wait returned event: %v", event.EventData)

x.handleEvent(event)
}
Expand Down
24 changes: 21 additions & 3 deletions config/default_spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,27 @@ infiniband:
default:
<<: *ib_base
hca:
MT_0000000838: {}
DEL0000000036: {}
MT_0000000223: {}
MT_0000000838:
hardware:
hca_type: "MT4129"
board_id: "MT_0000000838"
fw_ver: ">=28.39.2048"
vpd: "NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP, PCIe 5.0 x16, Crypto Disabled, Secure Boot Enabled"
net_port: 1
port_speed: "400 Gb/sec (4X NDR)"
phy_state: "LinkUp"
port_state: "ACTIVE"
net_operstate: "down"
link_layer: "InfiniBand"
pcie_width: "16"
pcie_speed: "32.0 GT/s PCIe"
pcie_tree_width: "16"
pcie_tree_speed: "32"
pcie_acs: "disable"
pcie_mrr: "4096"
perf:
one_way_bw: 360 # Gbps
avg_latency_us: 10 # us
pcie_topo:
"0x233510de":
numa_config:
Expand Down
80 changes: 80 additions & 0 deletions docker/Dockerfile.cuda128
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS build

ARG GO_VERSION=1.23.3
ARG GORELEASER_VERSION=v2.13.1

# Disable the CUDA apt repos for this stage (nothing here needs them and their
# mirrors are slow / occasionally unreachable).  Move the list files into a
# directory: the previous form `mv cuda*.list /tmp/disabled-cuda.list` fails
# whenever the glob matches more than one file (multi-source mv requires a
# directory target) and the `|| true` silently swallowed that failure, leaving
# the repos enabled.
RUN mkdir -p /tmp/disabled-apt && \
    (mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-apt/ 2>/dev/null || true) && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential gcc g++ curl git wget ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Install the Go toolchain used by GoReleaser.
RUN curl -fsSL https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz | \
    tar -C /usr/local -xz && \
    ln -s /usr/local/go/bin/go /usr/local/bin/go

# Install GoReleaser (builds the sichek binaries and .deb packages).
RUN curl -fsSL https://github.com/goreleaser/goreleaser/releases/download/${GORELEASER_VERSION}/goreleaser_Linux_x86_64.tar.gz | \
    tar -xz -C /usr/local/bin goreleaser && \
    goreleaser --version

# Checksum-DB lookups would fail in an offline/vendored build.
ENV GOSUMDB=off
WORKDIR /go/src/sichek

COPY . .

# BUILD_TIME is optional; when provided it is exported so the release embeds a
# reproducible timestamp, otherwise GoReleaser falls back to its own default.
ARG BUILD_TIME
RUN if [ -n "$BUILD_TIME" ]; then \
    BUILD_TIME=$BUILD_TIME goreleaser release --snapshot --clean ; \
    else \
    goreleaser release --snapshot --clean ; \
    fi

FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04

ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai
# NOTE(review): cleared so the container runtime does not inject GPU devices by
# default — confirm this is the intended default for the runtime image.
ENV NVIDIA_VISIBLE_DEVICES=""

# Same glob-safe disabling of the CUDA apt repos as in the build stage.
RUN mkdir -p /tmp/disabled-apt && \
    (mv /etc/apt/sources.list.d/cuda*.list /tmp/disabled-apt/ 2>/dev/null || true) && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    vim curl tzdata ca-certificates \
    openssh-server openssh-client \
    iproute2 iputils-ping jq \
    libnuma1 numactl rdmacm-utils perftest xz-utils && \
    ln -fs /usr/share/zoneinfo/${TZ} /etc/localtime && \
    dpkg-reconfigure --frontend noninteractive tzdata && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Helm (runtime tool).  NOTE(review): the upstream script installs the latest
# Helm v3 — pin a version if reproducible images are required.
RUN curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

WORKDIR /opt/sichek
COPY --from=build /go/src/sichek/dist ./dist

# Download the SICL runtime bundle, run its installer, then install the sichek
# .deb that the build stage produced.
ARG SICL_VERSION=sicl-25.11-1.cuda128.ubuntu2004.run
RUN curl -fsSL -o ./dist/${SICL_VERSION} \
    https://oss-ap-southeast.scitix.ai/scitix-release/${SICL_VERSION} && \
    bash ./dist/${SICL_VERSION} && \
    dpkg -i ./dist/sichek_*_linux_amd64.deb

ENV SIHPC_HOME=/usr/local/sihpc
ENV PATH=${SIHPC_HOME}/bin:$PATH
ENV LD_LIBRARY_PATH=${SIHPC_HOME}/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV OMPI_MCA_opal_prefix=${SIHPC_HOME}
ENV OPAL_PREFIX=${SIHPC_HOME}

# Passwordless, host-key-checking-free SSH between MPI ranks.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
    mkdir -p /var/run/sshd /root/.ssh && \
    chmod 700 /root/.ssh
# Bake a keypair so containers started from this image can SSH to each other.
# NOTE(review): every consumer of this image shares the same private key —
# acceptable only on closed cluster networks.
RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -N "" && \
    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
    chmod 600 /root/.ssh/authorized_keys

EXPOSE 22
Loading
Loading