From 91380a1e1d68c74125160c87466c06497c25ef14 Mon Sep 17 00:00:00 2001 From: lzi-a11y Date: Mon, 9 Mar 2026 20:01:41 +0800 Subject: [PATCH 1/3] feat: Add Ethernet component, including its collector, checker, and command-line support. --- cmd/command/command.go | 4 +- cmd/command/component/all.go | 3 + cmd/command/component/ethernet.go | 93 ++++ components/ethernet/checker/checker.go | 530 +++++++++++++++++++ components/ethernet/collector/collector.go | 437 +++++++++++++++ components/ethernet/config/config.go | 67 +++ components/ethernet/config/default_spec.yaml | 13 + components/ethernet/config/spec.go | 63 +++ components/ethernet/ethernet.go | 336 ++++++++++++ components/ethernet/ethernet_test.go | 174 ++++++ config/default_spec.yaml | 15 +- config/default_user_config.yaml | 7 +- consts/consts.go | 2 +- docs/ethernet.md | 70 +++ 14 files changed, 1810 insertions(+), 4 deletions(-) create mode 100644 cmd/command/component/ethernet.go create mode 100644 components/ethernet/checker/checker.go create mode 100644 components/ethernet/collector/collector.go create mode 100644 components/ethernet/config/config.go create mode 100644 components/ethernet/config/default_spec.yaml create mode 100644 components/ethernet/config/spec.go create mode 100644 components/ethernet/ethernet.go create mode 100644 components/ethernet/ethernet_test.go create mode 100644 docs/ethernet.md diff --git a/cmd/command/command.go b/cmd/command/command.go index 22eff6a6..f8bad6ac 100644 --- a/cmd/command/command.go +++ b/cmd/command/command.go @@ -41,6 +41,8 @@ func NewRootCmd() *cobra.Command { "h": true, "all": true, "run": true, + "ethernet": true, + "e": true, } if commandsRequireRoot[cmd.Use] { @@ -57,7 +59,7 @@ func NewRootCmd() *cobra.Command { rootCmd.AddCommand(component.NewCPUCmd()) rootCmd.AddCommand(component.NewNvidiaCmd()) rootCmd.AddCommand(component.NewInfinibandCmd()) - // rootCmd.AddCommand(component.NewEthernetCmd()) + rootCmd.AddCommand(component.NewEthernetCmd()) 
rootCmd.AddCommand(component.NewGpfsCmd()) rootCmd.AddCommand(component.NewPodLogCmd()) rootCmd.AddCommand(component.NewDmesgCmd()) diff --git a/cmd/command/component/all.go b/cmd/command/component/all.go index fe0533f5..e3d636dd 100644 --- a/cmd/command/component/all.go +++ b/cmd/command/component/all.go @@ -26,6 +26,7 @@ import ( "github.com/scitix/sichek/components/common" "github.com/scitix/sichek/components/cpu" "github.com/scitix/sichek/components/dmesg" + "github.com/scitix/sichek/components/ethernet" "github.com/scitix/sichek/components/gpfs" gpuevents "github.com/scitix/sichek/components/gpuevents" "github.com/scitix/sichek/components/infiniband" @@ -186,6 +187,8 @@ func NewComponent(componentName string, cfgFile string, specFile string, ignored case consts.ComponentNameSyslog: // if skipPercent is -1, use the value from the config file return syslog.NewComponent(cfgFile, "", -1) + case consts.ComponentNameEthernet: + return ethernet.NewEthernetComponent(cfgFile, specFile, ignoredCheckers) default: return nil, fmt.Errorf("invalid component name: %s", componentName) } diff --git a/cmd/command/component/ethernet.go b/cmd/command/component/ethernet.go new file mode 100644 index 00000000..3ef6824c --- /dev/null +++ b/cmd/command/component/ethernet.go @@ -0,0 +1,93 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package component + +import ( + "context" + "strings" + + "github.com/scitix/sichek/cmd/command/spec" + "github.com/scitix/sichek/components/ethernet" + "github.com/scitix/sichek/consts" + + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" +) + +func NewEthernetCmd() *cobra.Command { + var ( + cfgFile string + specFile string + ignoredCheckersStr string + verbose bool + ) + ethernetCmd := &cobra.Command{ + Use: "ethernet", + Aliases: []string{"e"}, + Short: "Perform Ethernet HealthCheck", + Run: func(cmd *cobra.Command, args []string) { + ctx, cancel := context.WithTimeout(context.Background(), consts.CmdTimeout) + + if !verbose { + logrus.SetLevel(logrus.ErrorLevel) + defer cancel() + } else { + logrus.SetLevel(logrus.DebugLevel) + defer func() { + logrus.WithField("component", "ethernet").Info("Run ethernet Cmd context canceled") + cancel() + }() + } + + resolvedCfgFile, err := spec.EnsureCfgFile(cfgFile) + if err != nil { + logrus.WithField("daemon", "ethernet").Errorf("failed to load cfgFile: %v", err) + } else { + logrus.WithField("daemon", "ethernet").Info("load cfgFile: " + resolvedCfgFile) + } + resolvedSpecFile, err := spec.EnsureSpecFile(specFile) + if err != nil { + logrus.WithField("daemon", "ethernet").Errorf("failed to load specFile: %v", err) + } else { + logrus.WithField("daemon", "ethernet").Info("load specFile: " + resolvedSpecFile) + } + + var ignoredCheckers []string + if len(ignoredCheckersStr) > 0 { + ignoredCheckers = strings.Split(ignoredCheckersStr, ",") + } + + component, err := ethernet.NewEthernetComponent(resolvedCfgFile, resolvedSpecFile, ignoredCheckers) + if err != nil { + logrus.WithField("component", "ethernet").Error(err) + return + } + logrus.WithField("component", "ethernet").Infof("Run Ethernet component check: %s", component.Name()) + result, err := RunComponentCheck(ctx, component, consts.CmdTimeout) + if err != nil { + return + } + PrintCheckResults(true, result) + }, + } + + 
ethernetCmd.Flags().StringVarP(&cfgFile, "cfg", "c", "", "Path to the user config file") + ethernetCmd.Flags().StringVarP(&specFile, "spec", "s", "", "Path to the Ethernet specification file") + ethernetCmd.Flags().StringVarP(&ignoredCheckersStr, "ignored-checkers", "i", "", "Ignored checkers") + ethernetCmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output") + + return ethernetCmd +} diff --git a/components/ethernet/checker/checker.go b/components/ethernet/checker/checker.go new file mode 100644 index 00000000..df3179b1 --- /dev/null +++ b/components/ethernet/checker/checker.go @@ -0,0 +1,530 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
// extractInt returns the first capture group of pattern in input, parsed as a
// base-10 int64.
//
// It returns 0 when pattern is not a valid regular expression, when it does
// not match input, when it has no capture group, or when the captured text
// does not fit in an int64. Callers therefore cannot distinguish "absent"
// from a literal 0.
func extractInt(input, pattern string) int64 {
	re, err := regexp.Compile(pattern)
	if err != nil {
		// A malformed pattern is treated like "no match" rather than
		// panicking inside a health check.
		return 0
	}
	matches := re.FindStringSubmatch(input)
	if len(matches) < 2 {
		return 0
	}
	val, err := strconv.ParseInt(matches[1], 10, 64)
	if err != nil {
		// ParseInt clamps to the int64 range on overflow; reporting 0 keeps
		// the "unparseable means 0" contract instead of a made-up counter.
		return 0
	}
	return val
}
Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + expectedSpeed := "25000" // default to 25G + if c.spec != nil && c.spec.Speed != "" { + expectedSpeed = c.spec.Speed + } + + for _, bond := range info.BondInterfaces { + for slaveName, slaveState := range info.Slaves[bond] { + if !slaveState.LinkDetected { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "LinkDown" + result.Detail += fmt.Sprintf("物理网卡 %s 链路未检测到 UP。执行命令:ethtool %s,预期:Link detected: yes,当前发现未连接或 unknown。\n", slaveName, slaveName) + } + + if len(info.SyslogErrors) > 0 { + for _, errLine := range info.SyslogErrors { + if strings.Contains(errLine, "tx timeout") && strings.Contains(errLine, slaveName) { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "TxTimeout" + result.Detail += fmt.Sprintf("网卡 %s 在内核日志发现 tx timeout。执行命令:dmesg | grep -iE 'eth|mlx|link'。\n", slaveName) + break + } + } + } + + // check speed + speedStr := strconv.Itoa(slaveState.Speed) + if speedStr != expectedSpeed && slaveState.Speed > 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "SpeedMismatch" + result.Detail += fmt.Sprintf("网卡 %s 速率不匹配。预期: %sMb/s,当前发现: %sMb/s。\n", slaveName, expectedSpeed, speedStr) + } + + // Parse stats + sStats := info.Stats[slaveName] + + // CRC errors + currCRC := sStats.RXErrors // Approximation, standard ip -s link maps CRC errors to RX errors broadly. For exact CRC, ethtool parsing should remain, but for now we follow the general RX error growth. 
+ if prev, ok := c.prevCRC[slaveName]; ok && currCRC > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "CRCErrorsGrowing" + result.Detail += fmt.Sprintf("网卡 %s RX (CRC) 错误持续增长。之前: %d,当前: %d。\n", slaveName, prev, currCRC) + } + c.prevCRC[slaveName] = currCRC + + // Carrier errors + currCarrierIPS := sStats.Carrier + if prev, ok := c.prevCarrier[slaveName]; ok && currCarrierIPS > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "CarrierErrorsGrowing" + result.Detail += fmt.Sprintf("网卡 %s Carrier 错误持续增长。之前: %d,当前: %d。\n", slaveName, prev, currCarrierIPS) + } + c.prevCarrier[slaveName] = currCarrierIPS + + // Drops + currDrops := sStats.Dropped + if prev, ok := c.prevDrops[slaveName]; ok && currDrops > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DropsGrowing" + result.Detail += fmt.Sprintf("网卡 %s Drops 持续增长。之前: %d,当前: %d。\n", slaveName, prev, currDrops) + } + c.prevDrops[slaveName] = currDrops + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "请检查物理链路、网线、驱动版本(ethtool -i),或查看 dmesg 中确认具体错误;如果是速率不匹配,请检查对应配置。" + } + + return result, nil +} + +type L2Checker struct { + spec *config.EthernetSpecConfig + prevLinkFailures map[string]int64 + prevActiveSlave map[string]string +} + +func (c *L2Checker) Name() string { return config.EthernetL2CheckerName } +func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + expectedMinSlaves := 2 + if c.spec != nil && c.spec.MinSlaves > 0 { + expectedMinSlaves = c.spec.MinSlaves + } + + for _, bond := range 
info.BondInterfaces { + bState, exists := info.Bonds[bond] + if !exists || bState.Name == "" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "BondingMissing" + result.Detail = fmt.Sprintf("Bond %s 在 /proc/net/bonding 中缺失。\n", bond) + continue + } + + expectedMII := "up" + procContent := info.ProcNetBonding[bond] + if c.spec != nil && c.spec.MIIStatus != "" { + expectedMII = c.spec.MIIStatus + } + + if (expectedMII == "up" && !bState.IsUp) || !strings.Contains(procContent, "MII Status: "+expectedMII) { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "BondDown" + result.Detail += fmt.Sprintf("Bond 接口 %s 的总体状态不符合预期。命令:cat /proc/net/bonding/%s,预期:MII Status: %s,当前不匹配(可能为 down)。\n", bond, bond, expectedMII) + } + + // check MTU + if c.spec != nil && c.spec.MTU != "" { + expectedMTU, _ := strconv.Atoi(c.spec.MTU) + if bState.MTU > 0 && bState.MTU != expectedMTU { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "MTUMismatch" + } + result.Detail += fmt.Sprintf("Bond %s 的 MTU 不匹配。预期: %d,实际: %d。\n", bond, expectedMTU, bState.MTU) + } + } + + // check xmit_hash_policy + if c.spec != nil && c.spec.XmitHashPolicy != "" { + if bState.XmitHashPolicy != "" && bState.XmitHashPolicy != c.spec.XmitHashPolicy { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "XmitHashPolicyMismatch" + } + result.Detail += fmt.Sprintf("Bond %s 的 xmit_hash_policy 不匹配。预期: %s,当前: %s。执行命令:cat /sys/class/net/%s/bonding/xmit_hash_policy。\n", bond, c.spec.XmitHashPolicy, bState.XmitHashPolicy, bond) + } + } + + // check slave count + slaveCount := len(info.Slaves[bond]) + if slaveCount < expectedMinSlaves { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = 
consts.LevelWarning + result.ErrorName = "SlaveCountMismatch" + } + result.Detail += fmt.Sprintf("Bond %s 的 slave 数量不足。预期至少: %d,实际: %d。\n", bond, expectedMinSlaves, slaveCount) + } + + // check miimon, updelay, downdelay (fetching downdelay and updelay via regex since they aren't fully standard across systems on sysfs) + miimon := int64(bState.Miimon) + updelay := extractInt(procContent, `Up Delay \(ms\):\s*(\d+)`) + downdelay := extractInt(procContent, `Down Delay \(ms\):\s*(\d+)`) + + expectedMiimon := int64(0) + expectedUpDelay := int64(0) + expectedDownDelay := int64(0) + if c.spec != nil { + if c.spec.Miimon > 0 { + expectedMiimon = int64(c.spec.Miimon) + } + expectedUpDelay = int64(c.spec.UpDelay) + expectedDownDelay = int64(c.spec.DownDelay) + } + + if miimon == 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "MiimonDisabled" + result.Detail += fmt.Sprintf("Bond %s 的 MII Polling Interval (miimon) 为 0,未开启底层链路检测,这会导致物理链路断开时发生持续丢包。执行命令:cat /proc/net/bonding/%s,请务必开启!\n", bond, bond) + } else { + if expectedMiimon > 0 && miimon != expectedMiimon { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "MiimonMismatch" + } + result.Detail += fmt.Sprintf("Bond %s 的 miimon 不匹配。预期: %d ms,实际: %d ms。执行命令:cat /sys/class/net/%s/bonding/miimon。\n", bond, expectedMiimon, miimon, bond) + } + + if downdelay < miimon { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DowndelayTooSmall" + result.Detail += fmt.Sprintf("Bond %s 的 downdelay (%d ms) 小于 miimon (%d ms),配置不合理,可能导致不必要的震荡或丢包。执行命令:cat /proc/net/bonding/%s。\n", bond, downdelay, miimon, bond) + } else if expectedDownDelay > 0 && downdelay != expectedDownDelay { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DowndelayMismatch" + } + 
result.Detail += fmt.Sprintf("Bond %s 的 downdelay 不匹配。预期: %d ms,实际: %d ms。执行命令:cat /sys/class/net/%s/bonding/downdelay。\n", bond, expectedDownDelay, downdelay, bond) + } + + if updelay == 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "UpdelayZero" + result.Detail += fmt.Sprintf("Bond %s 的 updelay 为 0,由于交换机端口转发协商需要时间,立即切回流量极易产生丢包黑洞,建议设置 updelay。执行命令:cat /proc/net/bonding/%s。\n", bond, bond) + } else if expectedUpDelay > 0 && updelay != expectedUpDelay { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "UpdelayMismatch" + } + result.Detail += fmt.Sprintf("Bond %s 的 updelay 不匹配。预期: %d ms,实际: %d ms。执行命令:cat /sys/class/net/%s/bonding/updelay。\n", bond, expectedUpDelay, updelay, bond) + } + } + + // track active slave for flapping detection + activeSlaveMatch := regexp.MustCompile(`Currently Active Slave:\s*(\w+)`).FindStringSubmatch(procContent) + if len(activeSlaveMatch) > 1 { + currActive := activeSlaveMatch[1] + if prev, ok := c.prevActiveSlave[bond]; ok && prev != "" && prev != currActive { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ActiveSlaveFlapping" + result.Detail += fmt.Sprintf("Bond %s 发生了主备端口切换。之前主端口: %s,当前主端口: %s,如果频繁切换请重点关注物理层稳定性。\n", bond, prev, currActive) + } + c.prevActiveSlave[bond] = currActive + } + + // track Link Failure Count per slave + slavesData := strings.Split(procContent, "Slave Interface: ") + for i := 1; i < len(slavesData); i++ { + lines := strings.Split(slavesData[i], "\n") + if len(lines) == 0 { + continue + } + slaveName := strings.TrimSpace(lines[0]) + failCount := extractInt(slavesData[i], `Link Failure Count:\s*(\d+)`) + trackKey := bond + "-" + slaveName + + if prev, ok := c.prevLinkFailures[trackKey]; ok && failCount > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = 
"LinkFailureGrowing" + result.Detail += fmt.Sprintf("Bond %s 的从属网卡 %s 发生了链路断开(Link Failure)。之前故障次数: %d,当前: %d。\n", bond, slaveName, prev, failCount) + } + c.prevLinkFailures[trackKey] = failCount + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "请使用 cat /proc/net/bonding/bond0 核对 MII 状态及 Link Failure Count;确认配置文件(如 /etc/netplan 或 sysconfig) 中 miimon>0,且 slave 绑卡数量符合预期。" + } + + return result, nil +} + +type L3Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L3Checker) Name() string { return config.EthernetL3CheckerName } +func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + for _, bond := range info.BondInterfaces { + bState, ok := info.Bonds[bond] + if !ok || !strings.Contains(bState.Mode, "802.3ad") { + continue + } + + lacp, exists := info.LACP[bond] + if !exists || lacp.PartnerMacAddress == "" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "ActiveAggregatorMissing" + result.Detail += fmt.Sprintf("Bond %s 配置为 802.3ad 模式,但未找到有效的 Active Aggregator 协商信息,可能对端交换机未配置 LACP 或链路异常。\n", bond) + continue + } + + if lacp.PartnerMacAddress == "00:00:00:00:00:00" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "PartnerMacInvalid" + result.Detail += fmt.Sprintf("Bond %s 的 Partner Mac Address 为全零。对端交换机未响应 LACP 报文,聚合失败。\n", bond) + } + + for slaveName, sState := range info.Slaves[bond] { + if !sState.IsUp { + continue + } + + if slaveAggID, ok := lacp.SlaveAggregatorIDs[slaveName]; ok && slaveAggID != lacp.ActiveAggregatorID { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + 
result.ErrorName = "AggregatorMismatch" + result.Detail += fmt.Sprintf("从属网卡 %s 的 Aggregator ID (%s) 与全局 Active Aggregator ID (%s) 不一致。该网卡虽然物理 UP,但在二层无法加入到数据转发聚合组中,请检查交换机端口配置或网线。\n", slaveName, slaveAggID, lacp.ActiveAggregatorID) + } + + if portKey, ok := lacp.SlaveActorKeys[slaveName]; ok && portKey != lacp.ActorKey { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ActorKeyMismatch" + result.Detail += fmt.Sprintf("从属网卡 %s 的 port key (%s) 与全局 Actor Key (%s) 不一致。\n", slaveName, portKey, lacp.ActorKey) + } + + if operKey, ok := lacp.SlavePartnerKeys[slaveName]; ok && operKey != lacp.PartnerKey { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "PartnerKeyMismatch" + result.Detail += fmt.Sprintf("从属网卡 %s 的 oper key (%s) 与全局 Partner Key (%s) 不一致,对端交换机 LACP key 协商异常。\n", slaveName, operKey, lacp.PartnerKey) + } + } + + if c.spec != nil && c.spec.LACPRate != "" { + if !strings.Contains(bState.LACPRate, c.spec.LACPRate) { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "LACPRateMismatch" + } + result.Detail += fmt.Sprintf("Bond %s LACP rate 不匹配。命令:cat /sys/class/net/%s/bonding/lacp_rate,预期:%s,当前:%s。\n", bond, bond, c.spec.LACPRate, bState.LACPRate) + } + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "建议同步排查对端交换机 (Switch) 上的 LACP / Eth-Trunk 聚合配置是否开启和匹配。" + } + + return result, nil +} + +type L4Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L4Checker) Name() string { return config.EthernetL4CheckerName } +func (c *L4Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + 
Level: consts.LevelInfo, + Curr: "OK", + } + + if strings.Contains(info.IPNeigh, "FAILED") || strings.Contains(info.IPNeigh, "INCOMPLETE") { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ARPFailed" + result.Detail = "在 ARP 邻居表中发现 FAILED/INCOMPLETE 失败条目。这证明与某些邻居节点的二层 MAC 解析失败。" + result.Suggestion = "请核对本端及交换机的 VLAN ID 配置及二层放行,或使用 arping 测试连通性并在 bond 口 tcpdump 抓弃 ARP request/reply。" + } + + if info.Routes.GatewayIP != "" && !info.Routes.GatewayReachable { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "GatewayUnreachable" + result.Detail += fmt.Sprintf("系统的网关 (%s) 不可达 (ping 失败,且不在 ARP 邻居表中)。\n", info.Routes.GatewayIP) + } + + return result, nil +} + +type L5Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L5Checker) Name() string { return config.EthernetL5CheckerName } +func (c *L5Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + if !info.Routes.DefaultRouteViaBond { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DirectRouteMismatch" + result.Detail += "系统的默认路由并非直接指向绑定的目标 bond 网卡,可能会导致预期业务流量不走 bond。\n" + } + + if info.RPFilter["all"] == "1" { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "RPFilterEnabled" + } + result.Detail += "系统启用了 rp_filter (all=1),可能导致非对称路由丢包。命令:sysctl -n net.ipv4.conf.all.rp_filter,预期:0 或 2,当前:1。\n" + } + + for bond, val := range info.RPFilter { + if bond != "all" && val == "1" { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + 
result.Level = consts.LevelWarning + result.ErrorName = "RPFilterEnabled" + } + result.Detail += fmt.Sprintf("Bond %s 启用了 rp_filter=1。预期:0 或 2。\n", bond) + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "如果发生丢包情况,建议检查路由匹配、多网卡下的策略路由(ip rule),以及将 rp_filter 配置为 0(关闭) 或 2(松散模式)。" + } + + return result, nil +} diff --git a/components/ethernet/collector/collector.go b/components/ethernet/collector/collector.go new file mode 100644 index 00000000..c65fb327 --- /dev/null +++ b/components/ethernet/collector/collector.go @@ -0,0 +1,437 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package collector + +import ( + "context" + "encoding/json" + "fmt" + "regexp" + "strconv" + "strings" + + "github.com/scitix/sichek/pkg/utils" +) + +type BondState struct { + Name string `json:"name"` + IsUp bool `json:"is_up"` + HasLowerUp bool `json:"has_lower_up"` + IPAddr string `json:"ip_addr"` + MTU int `json:"mtu"` + Mode string `json:"mode"` + Miimon int `json:"miimon"` + XmitHashPolicy string `json:"xmit_hash_policy"` + LACPRate string `json:"lacp_rate"` + ActiveSlave string `json:"active_slave"` +} + +type SlaveState struct { + Name string `json:"name"` + IsUp bool `json:"is_up"` + LinkDetected bool `json:"link_detected"` + Speed int `json:"speed"` // Mbps + Duplex string `json:"duplex"` // "Full", "Half" +} + +type LACPState struct { + ActiveAggregatorID string `json:"active_aggregator_id"` + ActorKey string `json:"actor_key"` + PartnerKey string `json:"partner_key"` + PartnerMacAddress string `json:"partner_mac_address"` + SlaveAggregatorIDs map[string]string `json:"slave_aggregator_ids"` + SlaveActorKeys map[string]string `json:"slave_actor_keys"` + SlavePartnerKeys map[string]string `json:"slave_partner_keys"` +} + +type TrafficStats struct { + RXErrors int64 `json:"rx_errors"` + TXErrors int64 `json:"tx_errors"` + Dropped int64 `json:"dropped"` + Carrier int64 `json:"carrier"` +} + +type RouteState struct { + DefaultRouteViaBond bool `json:"default_route_via_bond"` + GatewayReachable bool `json:"gateway_reachable"` + GatewayIP string `json:"gateway_ip"` +} + +type EthernetInfo struct { + BondInterfaces []string + Bonds map[string]BondState + Slaves map[string]map[string]SlaveState // Bond -> SlaveName -> Info + LACP map[string]LACPState + Stats map[string]TrafficStats // Maps iface -> stats + Routes RouteState + SyslogErrors []string + + // Legacy string outputs (kept temporarily for backwards compatibility with un-migrated checkers) + ProcNetBonding map[string]string + BondSlaves map[string][]string + Ethtool map[string]string + EthtoolS 
map[string]string + EthtoolI map[string]string + IPSLink map[string]string + IPLink string + IPAddr string + IPRoute string + IPRule string + IPNeigh string + BridgeVlan string + BridgeFdb string + Dmesg string + RPFilter map[string]string + SysfsBonding map[string]map[string]string +} + +func (e *EthernetInfo) JSON() (string, error) { + b, err := json.Marshal(e) + return string(b), err +} + +type EthernetCollector struct { + name string + info *EthernetInfo + targetBond string +} + +func NewEthernetCollector(targetBond string) (*EthernetCollector, error) { + return &EthernetCollector{ + name: "EthernetCollector", + targetBond: targetBond, + info: &EthernetInfo{ + Bonds: make(map[string]BondState), + Slaves: make(map[string]map[string]SlaveState), + LACP: make(map[string]LACPState), + Stats: make(map[string]TrafficStats), + SyslogErrors: make([]string, 0), + ProcNetBonding: make(map[string]string), + SysfsBonding: make(map[string]map[string]string), + BondSlaves: make(map[string][]string), + Ethtool: make(map[string]string), + EthtoolS: make(map[string]string), + EthtoolI: make(map[string]string), + IPSLink: make(map[string]string), + RPFilter: make(map[string]string), + }, + }, nil +} + +func (c *EthernetCollector) Name() string { + return c.name +} + +func (c *EthernetCollector) Collect(ctx context.Context) (*EthernetInfo, error) { + out, _ := utils.ExecCommand(ctx, "ip", "-o", "link", "show", "type", "bond") + lines := strings.Split(string(out), "\n") + c.info.BondInterfaces = nil + for _, l := range lines { + if l == "" { + continue + } + parts := strings.Split(l, ": ") + if len(parts) >= 2 { + name := strings.Split(strings.TrimSpace(parts[1]), "@")[0] + c.info.BondInterfaces = append(c.info.BondInterfaces, name) + } + } + + // Filter based on targetBond + if c.targetBond != "" { + var filtered []string + for _, b := range c.info.BondInterfaces { + if b == c.targetBond { + filtered = append(filtered, b) + break + } + } + c.info.BondInterfaces = filtered + } + + 
for _, bond := range c.info.BondInterfaces { + c.info.Bonds[bond] = BondState{Name: bond} + c.info.Slaves[bond] = make(map[string]SlaveState) + + outProc, _ := utils.ExecCommand(ctx, "cat", "/proc/net/bonding/"+bond) + c.info.ProcNetBonding[bond] = string(outProc) + + // Parse BondState config from sysfs + attrs := []string{"mode", "miimon", "slaves", "xmit_hash_policy", "lacp_rate"} + if c.info.SysfsBonding[bond] == nil { + c.info.SysfsBonding[bond] = make(map[string]string) + } + for _, attr := range attrs { + outAttr, _ := utils.ExecCommand(ctx, "cat", "/sys/class/net/"+bond+"/bonding/"+attr) + c.info.SysfsBonding[bond][attr] = strings.TrimSpace(string(outAttr)) + } + + slavesStr := c.info.SysfsBonding[bond]["slaves"] + slaves := strings.Fields(slavesStr) + c.info.BondSlaves[bond] = slaves + + // Fetch sysctl rp_filter for bond + outRP, _ := utils.ExecCommand(ctx, "sysctl", "-n", "net.ipv4.conf."+bond+".rp_filter") + c.info.RPFilter[bond] = strings.TrimSpace(string(outRP)) + + // For each slave, fetch L1 info + for _, slave := range slaves { + outEth, _ := utils.ExecCommand(ctx, "ethtool", slave) + c.info.Ethtool[slave] = string(outEth) + + outEthS, _ := utils.ExecCommand(ctx, "ethtool", "-S", slave) + c.info.EthtoolS[slave] = string(outEthS) + + outEthI, _ := utils.ExecCommand(ctx, "ethtool", "-i", slave) + c.info.EthtoolI[slave] = string(outEthI) + + outIPSL, _ := utils.ExecCommand(ctx, "ip", "-s", "link", "show", slave) + c.info.IPSLink[slave] = string(outIPSL) + } + } + + var grepParts []string + if c.targetBond != "" { + grepParts = append(grepParts, c.targetBond) + } + + filterArgs := []string{"grep", "-iE", strings.Join(grepParts, "|")} + + if len(grepParts) > 0 { + outIPLink, _ := utils.ExecCommand(ctx, "sh", "-c", "ip -d link | "+strings.Join(filterArgs, " ")) + c.info.IPLink = string(outIPLink) + + outIPAddr, _ := utils.ExecCommand(ctx, "sh", "-c", "ip addr | "+strings.Join(filterArgs, " ")) + c.info.IPAddr = string(outIPAddr) + + outIPRoute, _ := 
utils.ExecCommand(ctx, "sh", "-c", "ip route | "+strings.Join(filterArgs, " ")) + c.info.IPRoute = string(outIPRoute) + + outIPRule, _ := utils.ExecCommand(ctx, "sh", "-c", "ip rule | "+strings.Join(filterArgs, " ")) + c.info.IPRule = string(outIPRule) + + outIPNeigh, _ := utils.ExecCommand(ctx, "sh", "-c", "ip neigh | "+strings.Join(filterArgs, " ")) + c.info.IPNeigh = string(outIPNeigh) + + outBridgeVlan, _ := utils.ExecCommand(ctx, "sh", "-c", "bridge vlan show | "+strings.Join(filterArgs, " ")) + c.info.BridgeVlan = string(outBridgeVlan) + + outBridgeFdb, _ := utils.ExecCommand(ctx, "sh", "-c", "bridge fdb show | "+strings.Join(filterArgs, " ")) + c.info.BridgeFdb = string(outBridgeFdb) + } else { + outIPLink, _ := utils.ExecCommand(ctx, "ip", "-d", "link") + c.info.IPLink = string(outIPLink) + + outIPAddr, _ := utils.ExecCommand(ctx, "ip", "addr") + c.info.IPAddr = string(outIPAddr) + + outIPRoute, _ := utils.ExecCommand(ctx, "ip", "route") + c.info.IPRoute = string(outIPRoute) + + outIPRule, _ := utils.ExecCommand(ctx, "ip", "rule") + c.info.IPRule = string(outIPRule) + + outIPNeigh, _ := utils.ExecCommand(ctx, "ip", "neigh") + c.info.IPNeigh = string(outIPNeigh) + + outBridgeVlan, _ := utils.ExecCommand(ctx, "bridge", "vlan", "show") + c.info.BridgeVlan = string(outBridgeVlan) + + outBridgeFdb, _ := utils.ExecCommand(ctx, "bridge", "fdb", "show") + c.info.BridgeFdb = string(outBridgeFdb) + } + + // Post-process parsed KV states for bonds + for _, bond := range c.info.BondInterfaces { + bState := c.info.Bonds[bond] + bState.Mode = c.info.SysfsBonding[bond]["mode"] + bState.XmitHashPolicy = c.info.SysfsBonding[bond]["xmit_hash_policy"] + bState.LACPRate = c.info.SysfsBonding[bond]["lacp_rate"] + bState.Miimon, _ = strconv.Atoi(c.info.SysfsBonding[bond]["miimon"]) + + // Link states from IP Link / Addr + bState.IsUp = strings.Contains(c.info.IPLink, fmt.Sprintf("%s: ", bond)) || strings.Contains(c.info.IPLink, "LOWER_UP") + + mtuMatch := 
regexp.MustCompile(fmt.Sprintf(`%s:.*mtu (\d+)`, bond)).FindStringSubmatch(c.info.IPLink) + if len(mtuMatch) > 1 { + bState.MTU, _ = strconv.Atoi(mtuMatch[1]) + } + + ipMatch := regexp.MustCompile(fmt.Sprintf(`inet ([\d\.]+)/\d+.*%s`, bond)).FindStringSubmatch(c.info.IPAddr) + if len(ipMatch) > 1 { + bState.IPAddr = ipMatch[1] + } + + // Parse ProcNetBonding for Active Slave and 802.3ad info + procStr := c.info.ProcNetBonding[bond] + activeSlaveMatch := regexp.MustCompile(`Currently Active Slave:\s*(\w+)`).FindStringSubmatch(procStr) + if len(activeSlaveMatch) > 1 { + bState.ActiveSlave = activeSlaveMatch[1] + } + + c.info.Bonds[bond] = bState + + // Populate LACP State + if strings.Contains(procStr, "Bonding Mode: IEEE 802.3ad") { + lacp := LACPState{ + SlaveAggregatorIDs: make(map[string]string), + SlaveActorKeys: make(map[string]string), + SlavePartnerKeys: make(map[string]string), + } + activeAggMatch := regexp.MustCompile(`(?s)Active Aggregator Info:\s*Aggregator ID:\s*(\d+).*?Actor Key:\s*(\d+).*?Partner Key:\s*(\d+).*?Partner Mac Address:\s*([\w:]+)`).FindStringSubmatch(procStr) + if len(activeAggMatch) > 4 { + lacp.ActiveAggregatorID = activeAggMatch[1] + lacp.ActorKey = activeAggMatch[2] + lacp.PartnerKey = activeAggMatch[3] + lacp.PartnerMacAddress = activeAggMatch[4] + } + + slavesData := strings.Split(procStr, "Slave Interface: ") + for i := 1; i < len(slavesData); i++ { + lines := strings.Split(slavesData[i], "\n") + if len(lines) == 0 { + continue + } + sName := strings.TrimSpace(lines[0]) + aggIDMatch := regexp.MustCompile(`Aggregator ID:\s*(\d+)`).FindStringSubmatch(slavesData[i]) + if len(aggIDMatch) > 1 { + lacp.SlaveAggregatorIDs[sName] = aggIDMatch[1] + } + + actorMatch := regexp.MustCompile(`port key:\s*(\d+)`).FindStringSubmatch(slavesData[i]) + if len(actorMatch) > 1 { + lacp.SlaveActorKeys[sName] = actorMatch[1] + } + + partnerMatch := regexp.MustCompile(`oper key:\s*(\d+)`).FindStringSubmatch(slavesData[i]) + if len(partnerMatch) > 1 { + 
lacp.SlavePartnerKeys[sName] = partnerMatch[1] + } + } + c.info.LACP[bond] = lacp + } + } + + for _, bond := range c.info.BondInterfaces { + for _, slave := range c.info.BondSlaves[bond] { + sState := SlaveState{Name: slave} + + // IsUp and Link detection + outEth := c.info.Ethtool[slave] + sState.LinkDetected = strings.Contains(outEth, "Link detected: yes") + sState.IsUp = strings.Contains(c.info.IPLink, fmt.Sprintf("%s: 1 { + sState.Speed, _ = strconv.Atoi(speedMatch[1]) + } + + duplexMatch := regexp.MustCompile(`Duplex:\s*(\w+)`).FindStringSubmatch(outEth) + if len(duplexMatch) > 1 { + sState.Duplex = duplexMatch[1] + } + + c.info.Slaves[bond][slave] = sState + + // Traffic Stats (ip -s link show) + sStats := TrafficStats{} + outIPSL := c.info.IPSLink[slave] + lines := strings.Split(outIPSL, "\n") + for i, line := range lines { + if strings.Contains(line, "RX:") && i+1 < len(lines) { + fields := strings.Fields(lines[i+1]) + if len(fields) >= 4 { + sStats.RXErrors, _ = strconv.ParseInt(fields[2], 10, 64) + sStats.Dropped, _ = strconv.ParseInt(fields[3], 10, 64) + } + } + if strings.Contains(line, "TX:") && i+1 < len(lines) { + fields := strings.Fields(lines[i+1]) + if len(fields) >= 4 { + sStats.TXErrors, _ = strconv.ParseInt(fields[2], 10, 64) + sStats.Carrier, _ = strconv.ParseInt(fields[3], 10, 64) + } + } + } + + c.info.Stats[slave] = sStats + } + } + + outRPAll, _ := utils.ExecCommand(ctx, "sysctl", "-n", "net.ipv4.conf.all.rp_filter") + c.info.RPFilter["all"] = strings.TrimSpace(string(outRPAll)) + + // Parse RouteState + rState := RouteState{} + routeLines := strings.Split(c.info.IPRoute, "\n") + for _, line := range routeLines { + if strings.HasPrefix(line, "default via ") { + fields := strings.Fields(line) + if len(fields) >= 5 { + rState.GatewayIP = fields[2] + if fields[4] == c.targetBond || (c.targetBond == "" && len(c.info.BondInterfaces) > 0 && fields[4] == c.info.BondInterfaces[0]) { + rState.DefaultRouteViaBond = true + } + } + break + } + } + + 
if rState.GatewayIP != "" { + // Check reachable in neigh + neighMatch := regexp.MustCompile(fmt.Sprintf(`%s\s+dev\s+.*?lladdr.*?REACHABLE`, regexp.QuoteMeta(rState.GatewayIP))).FindStringSubmatch(c.info.IPNeigh) + if len(neighMatch) > 0 { + rState.GatewayReachable = true + } else { + // Fallback ping if not in neigh Cache instantly + pingOut, _ := utils.ExecCommand(ctx, "ping", "-c", "1", "-W", "1", rState.GatewayIP) + if strings.Contains(string(pingOut), "1 received") { + rState.GatewayReachable = true + } + } + } + c.info.Routes = rState + + // Parse Syslog Errors + dmesgGrep := "eth|mlx|link|bond" + if len(grepParts) > 0 { + dmesgGrep = strings.Join(grepParts, "|") + } + outDmesg, _ := utils.ExecCommand(ctx, "sh", "-c", fmt.Sprintf("dmesg | grep -iE '%s' | tail -n 100", dmesgGrep)) + + dmesgStr := string(outDmesg) + c.info.Dmesg = dmesgStr + + for _, l := range strings.Split(dmesgStr, "\n") { + lowerLine := strings.ToLower(l) + if strings.Contains(lowerLine, "down") || strings.Contains(lowerLine, "fail") || strings.Contains(lowerLine, "error") || strings.Contains(lowerLine, "flap") { + c.info.SyslogErrors = append(c.info.SyslogErrors, strings.TrimSpace(l)) + } + } + + // Also append journalctl errors specifically for bonding + outJournal, err := utils.ExecCommand(ctx, "sh", "-c", fmt.Sprintf("journalctl -k -S \"1 hour ago\" | grep -iE '%s' | grep -iE 'down|fail|flap|error' | tail -n 20", dmesgGrep)) + if err == nil { + for _, l := range strings.Split(string(outJournal), "\n") { + if strings.TrimSpace(l) != "" { + c.info.SyslogErrors = append(c.info.SyslogErrors, strings.TrimSpace(l)) + } + } + } + + return c.info, nil +} diff --git a/components/ethernet/config/config.go b/components/ethernet/config/config.go new file mode 100644 index 00000000..cc5a759a --- /dev/null +++ b/components/ethernet/config/config.go @@ -0,0 +1,67 @@ +/* +Copyright 2024 The Scitix Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package config
+
+import (
+	"github.com/scitix/sichek/components/common"
+	"github.com/scitix/sichek/consts"
+)
+
+// EthernetUserConfig is the top-level user configuration wrapper; the
+// component settings live under the "ethernet" key.
+type EthernetUserConfig struct {
+	Ethernet *EthernetConfig `json:"ethernet" yaml:"ethernet"`
+}
+
+// EthernetConfig holds the user-tunable settings of the ethernet component:
+// the query interval, the result cache size and the list of ignored checkers.
+type EthernetConfig struct {
+	QueryInterval   common.Duration `json:"query_interval" yaml:"query_interval"`
+	CacheSize       int64           `json:"cache_size" yaml:"cache_size"`
+	IgnoredCheckers []string        `json:"ignored_checkers" yaml:"ignored_checkers"`
+}
+
+// GetQueryInterval returns the configured query interval, or the zero
+// common.Duration when the "ethernet" section is absent.
+func (c *EthernetUserConfig) GetQueryInterval() common.Duration {
+	if c.Ethernet == nil {
+		return common.Duration{}
+	}
+	return c.Ethernet.QueryInterval
+}
+
+// SetQueryInterval stores newInterval, allocating the "ethernet" section
+// first when it is nil.
+func (c *EthernetUserConfig) SetQueryInterval(newInterval common.Duration) {
+	if c.Ethernet == nil {
+		c.Ethernet = &EthernetConfig{}
+	}
+	c.Ethernet.QueryInterval = newInterval
+}
+
+// Checker names; PrintInfo in the ethernet package switches on these values.
+const (
+	EthernetL1CheckerName = "L1(Physical Link)"
+	EthernetL2CheckerName = "L2(Bond)"
+	EthernetL3CheckerName = "L3(LACP)"
+	EthernetL4CheckerName = "L4(ARP)"
+	EthernetL5CheckerName = "L5(Routing)"
+)
+
+// EthernetCheckItems maps each checker name to a short description.
+var EthernetCheckItems = map[string]string{
+	EthernetL1CheckerName: "Check Layer 1 properties",
+	EthernetL2CheckerName: "Check Layer 2 properties",
+	EthernetL3CheckerName: "Check Layer 3 properties",
+	EthernetL4CheckerName: "Check Layer 4 properties",
+	EthernetL5CheckerName: "Check Layer 5 properties",
+}
+
+// LoadDefaultEventRules loads the event rules registered under
+// consts.ComponentNameEthernet. The rule group is returned together with the
+// loader's error, so it may be empty when err is non-nil.
+func LoadDefaultEventRules() (common.EventRuleGroup, error) {
+	eventRules := make(common.EventRuleGroup)
+	err := common.LoadDefaultEventRules(&eventRules, consts.ComponentNameEthernet)
+	return eventRules, err
+}
diff --git a/components/ethernet/config/default_spec.yaml b/components/ethernet/config/default_spec.yaml new file mode 100644 index 00000000..2b559801 --- /dev/null +++ b/components/ethernet/config/default_spec.yaml @@ -0,0 +1,13 @@ +ethernet: + default: + target_bond: "bond0" + bond_mode: "802.3ad" + mii_status: "up" + lacp_rate: "slow 0" + mtu: "1500" + speed: "25000" + min_slaves: 2 + xmit_hash_policy: "layer3+4 1" + miimon: 100 + updelay: 0 + downdelay: 0 diff --git a/components/ethernet/config/spec.go b/components/ethernet/config/spec.go new file mode 100644 index 00000000..bab74189 --- /dev/null +++ b/components/ethernet/config/spec.go @@ -0,0 +1,63 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/
+package config
+
+import (
+	"fmt"
+	"github.com/scitix/sichek/pkg/utils"
+	"github.com/sirupsen/logrus"
+)
+
+// EthernetSpecConfig is the expected ("spec") state of one bonded Ethernet
+// setup, as read from the "ethernet" section of the spec YAML.
+type EthernetSpecConfig struct {
+	TargetBond     string `json:"target_bond" yaml:"target_bond"`
+	BondMode       string `json:"bond_mode" yaml:"bond_mode"`
+	MIIStatus      string `json:"mii_status" yaml:"mii_status"`
+	LACPRate       string `json:"lacp_rate" yaml:"lacp_rate"`
+	MTU            string `json:"mtu" yaml:"mtu"`
+	Speed          string `json:"speed" yaml:"speed"`
+	MinSlaves      int    `json:"min_slaves" yaml:"min_slaves"`
+	XmitHashPolicy string `json:"xmit_hash_policy" yaml:"xmit_hash_policy"`
+	Miimon         int    `json:"miimon" yaml:"miimon"`
+	UpDelay        int    `json:"updelay" yaml:"updelay"`
+	DownDelay      int    `json:"downdelay" yaml:"downdelay"`
+}
+
+// EthernetSpecs is the on-disk layout of the spec file: named specs keyed
+// under "ethernet". Only the "default" entry is consumed by LoadSpec.
+type EthernetSpecs struct {
+	Ethernet map[string]*EthernetSpecConfig `json:"ethernet" yaml:"ethernet"`
+}
+
+// LoadSpec loads the "default" Ethernet spec from the given YAML file.
+//
+// It returns an error when file is empty, when the file cannot be parsed
+// (the underlying error is wrapped and reachable via errors.Is/As), when the
+// "ethernet" section is missing, or when the "default" entry is missing or
+// null. On success the returned spec is never nil.
+func LoadSpec(file string) (*EthernetSpecConfig, error) {
+	if file == "" {
+		return nil, fmt.Errorf("ethernet spec file path is empty")
+	}
+	s := &EthernetSpecs{}
+	if err := utils.LoadFromYaml(file, s); err != nil {
+		return nil, fmt.Errorf("failed to parse YAML file %s: %w", file, err)
+	}
+
+	if s.Ethernet == nil {
+		return nil, fmt.Errorf("ethernet spec is empty")
+	}
+
+	// For ethernet, we assume a "default" spec for now, similar to infiniband.
+	// A "default:" key with a null value decodes to a nil pointer in the map;
+	// treat it like a missing entry instead of returning (nil, nil).
+	spec, ok := s.Ethernet["default"]
+	if !ok || spec == nil {
+		return nil, fmt.Errorf("default ethernet spec not found in provided specs")
+	}
+
+	logrus.WithField("component", "ethernet").Infof("Loaded default Ethernet spec")
+	return spec, nil
+}
diff --git a/components/ethernet/ethernet.go b/components/ethernet/ethernet.go new file mode 100644 index 00000000..bc9b2e9d --- /dev/null +++ b/components/ethernet/ethernet.go @@ -0,0 +1,336 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package ethernet
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/scitix/sichek/components/common"
+	filter "github.com/scitix/sichek/components/common/eventfilter"
+	"github.com/scitix/sichek/components/ethernet/checker"
+	"github.com/scitix/sichek/components/ethernet/collector"
+	"github.com/scitix/sichek/components/ethernet/config"
+	"github.com/scitix/sichek/consts"
+	"github.com/scitix/sichek/pkg/utils"
+
+	"github.com/sirupsen/logrus"
+)
+
+// component is the Ethernet health-check component. It owns the collector,
+// the checkers, an optional event filter, and a fixed-size ring of the most
+// recent results and collected infos.
+type component struct {
+	ctx           context.Context
+	cancel        context.CancelFunc
+	componentName string
+	cfg           *config.EthernetUserConfig
+	cfgMutex      sync.Mutex // guards cfg
+	collector     *collector.EthernetCollector
+	checkers      []common.Checker
+	filter        *filter.EventFilter
+
+	cacheMtx    sync.RWMutex // guards cacheBuffer, cacheInfo and currIndex
+	cacheBuffer []*common.Result
+	cacheInfo   []common.Info
+	currIndex   int64 // next slot to be written
+	cacheSize   int64
+
+	service *common.CommonService
+}
+
+var (
+	ethernetComponent     *component
+	ethernetComponentErr  error
+	ethernetComponentOnce sync.Once
+)
+
+// NewEthernetComponent returns the process-wide Ethernet component, creating
+// it on the first call. Only the arguments of the first call are used; later
+// calls return the same instance, or the same initialization error.
+//
+// On failure it returns a literal nil interface, never a typed-nil
+// *component, so callers may safely compare the result against nil.
+func NewEthernetComponent(cfgFile string, specFile string, ignoredCheckers []string) (common.Component, error) {
+	ethernetComponentOnce.Do(func() {
+		defer func() {
+			if r := recover(); r != nil {
+				ethernetComponent = nil
+				ethernetComponentErr = fmt.Errorf("panic occurred when create component ethernet: %v", r)
+			}
+		}()
+		ethernetComponent, ethernetComponentErr = newEthernetComponent(cfgFile, specFile, ignoredCheckers)
+	})
+	// The init error is kept at package level: sync.Once runs the closure only
+	// once, so a per-call error variable would be nil on every later call.
+	if ethernetComponentErr != nil {
+		return nil, ethernetComponentErr
+	}
+	if ethernetComponent == nil {
+		return nil, fmt.Errorf("ethernet component is not initialized")
+	}
+	return ethernetComponent, nil
+}
+
+// newEthernetComponent builds a component from the user config and spec
+// files. Failures to load the user config, event rules, event filter or spec
+// are logged and tolerated; only collector or checker construction errors
+// abort creation, in which case the component context is cancelled.
+func newEthernetComponent(cfgFile string, specFile string, ignoredCheckers []string) (comp *component, err error) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer func() {
+		if err != nil {
+			cancel()
+		}
+	}()
+
+	cfg := &config.EthernetUserConfig{}
+	err = common.LoadUserConfig(cfgFile, cfg)
+	if err != nil || cfg.Ethernet == nil {
+		logrus.WithField("component", "ethernet").Warnf("get user config failed or ethernet config is nil, using default config")
+		cfg.Ethernet = &config.EthernetConfig{
+			QueryInterval: common.Duration{Duration: 60 * time.Second},
+			CacheSize:     5,
+		}
+	}
+	if len(ignoredCheckers) > 0 {
+		cfg.Ethernet.IgnoredCheckers = ignoredCheckers
+	}
+
+	eventRules, err := config.LoadDefaultEventRules()
+	if err != nil {
+		logrus.WithField("component", "ethernet").Warnf("failed to load eventrules: %v", err)
+	}
+
+	filterPointer, err := filter.NewEventFilter(consts.ComponentNameEthernet, eventRules, 100)
+	if err != nil {
+		logrus.WithField("component", "ethernet").Warnf("NewEthernetComponent create event filter failed: %v", err)
+		filterPointer = nil
+	}
+
+	spec, err := config.LoadSpec(specFile)
+	if err != nil {
+		logrus.WithField("component", "ethernet").Warnf("failed to load spec %s: %v", specFile, err)
+	}
+
+	// spec may be nil here (load failure is tolerated); it is handed to
+	// NewCheckers as-is.
+	targetBond := ""
+	if spec != nil {
+		targetBond = spec.TargetBond
+	}
+
+	collectorInst, err := collector.NewEthernetCollector(targetBond)
+	if err != nil {
+		logrus.WithField("component", "ethernet").Errorf("NewEthernetComponent create collector failed: %v", err)
+		return nil, err
+	}
+
+	checkers, err := checker.NewCheckers(cfg, spec)
+	if err != nil {
+		return nil, err
+	}
+
+	// Fall back to the default for unset and negative sizes alike: a negative
+	// length would make the make() calls below panic.
+	cacheSize := cfg.Ethernet.CacheSize
+	if cacheSize <= 0 {
+		cacheSize = 5
+	}
+
+	component := &component{
+		ctx:           ctx,
+		cancel:        cancel,
+		componentName: consts.ComponentNameEthernet,
+		collector:     collectorInst,
+		checkers:      checkers,
+		filter:        filterPointer,
+		cfg:           cfg,
+		cacheBuffer:   make([]*common.Result, cacheSize),
+		cacheInfo:     make([]common.Info, cacheSize),
+		cacheSize:     cacheSize,
+	}
+	service := common.NewCommonService(ctx, cfg, component.componentName, component.GetTimeout(), component.HealthCheck)
+	component.service = service
+
+	return component, nil
+}
+
+// Name returns the component name.
+func (c *component) Name() string {
+	return c.componentName
+}
+
+// HealthCheck collects the current Ethernet info, runs all checkers on it,
+// merges in the event-filter result when a filter is configured, and stores
+// the result and info in the ring cache. It returns an error only when
+// collection fails.
+func (c *component) HealthCheck(ctx context.Context) (*common.Result, error) {
+	timer := common.NewTimer(fmt.Sprintf("%s-HealthCheck-Cost", c.componentName))
+	ethInfo, err := c.collector.Collect(ctx)
+	if err != nil {
+		logrus.WithField("component", "ethernet").Errorf("failed to collect ethernet info: %v", err)
+		return nil, err
+	}
+	logrus.WithField("component", "ethernet").Infof("collected ethernet info: %+v", ethInfo)
+
+	result := common.Check(ctx, c.componentName, ethInfo, c.checkers)
+	timer.Mark("ethernet-check")
+
+	if c.filter != nil {
+		eventResult := c.filter.Check()
+		timer.Mark("event-filter")
+		if eventResult != nil {
+			result.Checkers = append(result.Checkers, eventResult.Checkers...)
+			// An abnormal event result makes the whole result abnormal and
+			// can only raise, never lower, the reported level.
+			if eventResult.Status == consts.StatusAbnormal {
+				result.Status = consts.StatusAbnormal
+				if consts.LevelPriority[result.Level] < consts.LevelPriority[eventResult.Level] {
+					result.Level = eventResult.Level
+				}
+			}
+		}
+	}
+
+	c.cacheMtx.Lock()
+	c.cacheBuffer[c.currIndex] = result
+	c.cacheInfo[c.currIndex] = ethInfo
+	c.currIndex = (c.currIndex + 1) % c.cacheSize
+	c.cacheMtx.Unlock()
+
+	if result.Status == consts.StatusAbnormal && consts.LevelPriority[result.Level] > consts.LevelPriority[consts.LevelInfo] {
+		logrus.WithField("component", "ethernet").Errorf("Health Check Failed")
+	} else {
+		logrus.WithField("component", "ethernet").Infof("Health Check PASSED")
+	}
+
+	return result, nil
+}
+
+// CacheResults returns the ring buffer of cached results. The returned slice
+// is the component's own buffer, not a copy.
+func (c *component) CacheResults() ([]*common.Result, error) {
+	c.cacheMtx.RLock()
+	defer c.cacheMtx.RUnlock()
+	return c.cacheBuffer, nil
+}
+
+// LastResult returns the most recently stored result, or nil when no health
+// check has completed yet.
+func (c *component) LastResult() (*common.Result, error) {
+	c.cacheMtx.RLock()
+	defer c.cacheMtx.RUnlock()
+	// currIndex is the next slot to be written, so the latest entry sits one
+	// position before it, wrapping around at the start of the ring.
+	var result *common.Result
+	if c.currIndex == 0 {
+		result = c.cacheBuffer[c.cacheSize-1]
+	} else {
+		result = c.cacheBuffer[c.currIndex-1]
+	}
+	return result, nil
+}
+
+// CacheInfos returns the ring buffer of cached infos. The returned slice is
+// the component's own buffer, not a copy.
+func (c *component) CacheInfos() ([]common.Info, error) {
+	c.cacheMtx.RLock()
+	defer c.cacheMtx.RUnlock()
+	return c.cacheInfo, nil
+}
+
+// LastInfo returns the most recently stored info, or nil when no health check
+// has completed yet.
+func (c *component) LastInfo() (common.Info, error) {
+	c.cacheMtx.RLock()
+	defer c.cacheMtx.RUnlock()
+	var info common.Info
+	if c.currIndex == 0 {
+		info = c.cacheInfo[c.cacheSize-1]
+	} else {
+		info = c.cacheInfo[c.currIndex-1]
+	}
+	return info, nil
+}
+
+// Start starts the underlying common service and returns its result channel.
+func (c *component) Start() <-chan *common.Result {
+	return c.service.Start()
+}
+
+// Stop stops the underlying common service.
+func (c *component) Stop() error {
+	return c.service.Stop()
+}
+
+// Update replaces the user config and forwards it to the underlying service.
+// It returns an error, without touching the current config, when cfg is not
+// a *config.EthernetUserConfig.
+func (c *component) Update(cfg common.ComponentUserConfig) error {
+	configPointer, ok := cfg.(*config.EthernetUserConfig)
+	if !ok {
+		return fmt.Errorf("update wrong config type for ethernet")
+	}
+	// Hold the lock only around the assignment so that neither the error
+	// return above nor the service call below runs with cfgMutex held.
+	c.cfgMutex.Lock()
+	c.cfg = configPointer
+	c.cfgMutex.Unlock()
+	return c.service.Update(cfg)
+}
+
+// Status reports the status of the underlying common service.
+func (c *component) Status() bool {
+	return c.service.Status()
+}
+
+// GetTimeout returns the configured query interval.
+func (c *component) GetTimeout() time.Duration {
+	c.cfgMutex.Lock()
+	defer c.cfgMutex.Unlock()
+	return c.cfg.GetQueryInterval().Duration
+}
+
+// PrintInfo prints a human-readable summary of info and result to stdout: the
+// per-bond sysfs settings, one status line per checker layer, and the list of
+// non-info abnormal events. It returns true when no checker reported an
+// abnormal status above info level.
+func (c *component) PrintInfo(info common.Info, result *common.Result, summaryPrint bool) bool {
+	checkAllPassed := true
+	if result.Status == consts.StatusAbnormal && consts.LevelPriority[result.Level] > consts.LevelPriority[consts.LevelInfo] {
+		checkAllPassed = false
+	}
+	ethEvent := make(map[string]string)
+
+	// Layers without a matching checker result keep the "Not Checked" label.
+	l1Print := fmt.Sprintf("L1(Link): %sNot Checked%s", consts.Yellow, consts.Reset)
+	l2Print := fmt.Sprintf("L2(Bond): %sNot Checked%s", consts.Yellow, consts.Reset)
+	l3Print := fmt.Sprintf("L3(LACP): %sNot Checked%s", consts.Yellow, consts.Reset)
+	l4Print := fmt.Sprintf("L4(ARP) : %sNot Checked%s", consts.Yellow, consts.Reset)
+	l5Print := fmt.Sprintf("L5(Route): %sNot Checked%s", consts.Yellow, consts.Reset)
+
+	utils.PrintTitle("Ethernet", "-")
+	checkerResults := result.Checkers
+	for _, res := range checkerResults {
+		if res.Status != consts.StatusNormal && res.Level != consts.LevelInfo {
+			checkAllPassed = false
+			ethEvent[res.Name] = fmt.Sprintf("Event: %s%s%s -> %s", consts.Red, res.ErrorName, consts.Reset, strings.TrimRight(res.Detail, "\n"))
+		}
+
+		statusColor := consts.Green
+		statusText := "OK"
+		if res.Status != consts.StatusNormal {
+			statusColor = consts.Red
+			statusText = "Err"
+		}
+
+		switch res.Name {
+		case config.EthernetL1CheckerName:
+			l1Print = fmt.Sprintf("L1(Physical Link): %s%s%s", statusColor, statusText, consts.Reset)
+		case config.EthernetL2CheckerName:
+			l2Print = fmt.Sprintf("L2(Bonding) : %s%s%s", statusColor, statusText, consts.Reset)
+		case config.EthernetL3CheckerName:
+			l3Print = fmt.Sprintf("L3(LACP) : %s%s%s", statusColor, statusText, consts.Reset)
+		case config.EthernetL4CheckerName:
+			l4Print = fmt.Sprintf("L4(ARP) : %s%s%s", statusColor, statusText, consts.Reset)
+		case config.EthernetL5CheckerName:
+			l5Print = fmt.Sprintf("L5(Routing) : %s%s%s", statusColor, statusText, consts.Reset)
+		}
+	}
+
+	ethInfo, ok := info.(*collector.EthernetInfo)
+	if ok && len(ethInfo.BondInterfaces) > 0 {
+		for _, bond := range ethInfo.BondInterfaces {
+			fmt.Printf("Bond Interface: %s\n", bond)
+			if sysfs, exists := ethInfo.SysfsBonding[bond]; exists {
+				mode := sysfs["mode"]
+				miimon := sysfs["miimon"]
+				lacpRate := sysfs["lacp_rate"]
+				slaves := strings.Join(ethInfo.BondSlaves[bond], ", ")
+
+				fmt.Printf("Bond Mode: %-25s ", mode)
+				fmt.Printf("MII Monitor: %-25s\n", miimon)
+				fmt.Printf("LACP Rate: %-25s ", lacpRate)
+				fmt.Printf("Slaves : %-25s\n", slaves)
+			}
+
+			// Try parsing sysctl rp_filter
+			if rpFilter, exists := ethInfo.RPFilter[bond]; exists {
+				fmt.Printf("RP Filter: %-25s\n", rpFilter)
+			}
+			fmt.Println()
+		}
+	}
+
+	fmt.Printf("%-35s%-35s\n", l1Print, l2Print)
+	fmt.Printf("%-35s%-35s\n", l3Print, l4Print)
+	fmt.Printf("%-35s\n", l5Print)
+
+	if len(ethEvent) == 0 {
+		fmt.Printf("\nErrors Events:\n\tNo Ethernet Events Detected\n")
+	} else {
+		fmt.Printf("\nErrors Events:\n")
+		for _, v := range ethEvent {
+			fmt.Printf("\t%s\n", v)
+		}
+	}
+
+	fmt.Println()
+	return checkAllPassed
+}
diff --git a/components/ethernet/ethernet_test.go
b/components/ethernet/ethernet_test.go new file mode 100644 index 00000000..a2f19505 --- /dev/null +++ b/components/ethernet/ethernet_test.go @@ -0,0 +1,174 @@
+/*
+Copyright 2024 The Scitix Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package ethernet
+
+import (
+	"context"
+	"testing"
+
+	"github.com/scitix/sichek/components/ethernet/checker"
+	"github.com/scitix/sichek/components/ethernet/collector"
+	"github.com/scitix/sichek/components/ethernet/config"
+	"github.com/scitix/sichek/consts"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestEthernetCheckers feeds a hand-built EthernetInfo fixture that agrees
+// with the spec to every checker returned by checker.NewCheckers and asserts
+// that each one reports consts.StatusNormal.
+func TestEthernetCheckers(t *testing.T) {
+	spec := &config.EthernetSpecConfig{
+		BondMode:       "802.3ad",
+		MIIStatus:      "up",
+		LACPRate:       "fast",
+		MTU:            "1500",
+		MinSlaves:      1,
+		XmitHashPolicy: "layer3+4",
+		Miimon:         100,
+		UpDelay:        100,
+		DownDelay:      100,
+	}
+
+	info := &collector.EthernetInfo{
+		BondInterfaces: []string{"bond0"},
+		Bonds: map[string]collector.BondState{
+			"bond0": {
+				Name:           "bond0",
+				IsUp:           true,
+				HasLowerUp:     true,
+				IPAddr:         "192.168.1.1",
+				MTU:            1500,
+				Mode:           "802.3ad",
+				Miimon:         100,
+				XmitHashPolicy: "layer3+4",
+				LACPRate:       "fast",
+				ActiveSlave:    "eth0",
+			},
+		},
+		LACP: map[string]collector.LACPState{
+			"bond0": {
+				ActiveAggregatorID: "1",
+				ActorKey:           "21",
+				PartnerKey:         "40016",
+				PartnerMacAddress:  "11:22:33:44:55:66",
+				SlaveAggregatorIDs: map[string]string{"eth0": "1"},
+				SlaveActorKeys:     map[string]string{"eth0": "21"},
+				SlavePartnerKeys:   map[string]string{"eth0": "40016"},
+			},
+		},
+		Slaves: map[string]map[string]collector.SlaveState{
+			"bond0": {
+				"eth0": {Name: "eth0", IsUp: true, LinkDetected: true, Speed: 25000, Duplex: "Full"},
+			},
+		},
+		Stats: map[string]collector.TrafficStats{
+			"eth0": {RXErrors: 0, TXErrors: 0, Dropped: 0, Carrier: 0},
+		},
+		Routes: collector.RouteState{
+			DefaultRouteViaBond: true,
+			GatewayReachable:    true,
+			GatewayIP:           "192.168.1.1",
+		},
+		SyslogErrors: []string{},
+		ProcNetBonding: map[string]string{
+			"bond0": "MII Status: up\nUp Delay (ms): 100\nDown Delay (ms): 100\n",
+		},
+		RPFilter: map[string]string{
+			"all":   "0",
+			"bond0": "0",
+		},
+	}
+
+	ctx := context.Background()
+
+	checkers, err := checker.NewCheckers(&config.EthernetUserConfig{}, spec)
+	assert.NoError(t, err)
+
+	for _, c := range checkers {
+		res, err := c.Check(ctx, info)
+		assert.NoError(t, err)
+		assert.Equal(t, consts.StatusNormal, res.Status, "Checker %s failed unexpectedly: %s", c.Name(), res.Detail)
+	}
+}
+
+// TestEthernetCheckersFailures feeds a fixture that deviates from the spec
+// (link down, aggregator ID mismatch, all-zero partner MAC, non-zero error
+// counters, unreachable gateway, rp_filter set to 1, a syslog error) to every
+// checker and asserts that each one reports consts.StatusAbnormal.
+func TestEthernetCheckersFailures(t *testing.T) {
+	spec := &config.EthernetSpecConfig{
+		BondMode:       "802.3ad",
+		MIIStatus:      "up",
+		LACPRate:       "fast",
+		MTU:            "1500",
+		MinSlaves:      2,
+		XmitHashPolicy: "layer3+4",
+		Miimon:         100,
+		UpDelay:        100,
+		DownDelay:      100,
+	}
+
+	info := &collector.EthernetInfo{
+		BondInterfaces: []string{"bond0"},
+		Bonds: map[string]collector.BondState{
+			"bond0": {
+				Name:           "bond0",
+				IsUp:           false,
+				HasLowerUp:     false,
+				IPAddr:         "",
+				MTU:            1500,
+				Mode:           "802.3ad",
+				Miimon:         0,
+				XmitHashPolicy: "layer3+4",
+				LACPRate:       "slow",
+				ActiveSlave:    "",
+			},
+		},
+		LACP: map[string]collector.LACPState{
+			"bond0": {
+				ActiveAggregatorID: "1",
+				ActorKey:           "21",
+				PartnerKey:         "40016",
+				PartnerMacAddress:  "00:00:00:00:00:00",
+				SlaveAggregatorIDs: map[string]string{"eth0": "2"}, // mismatch
+				SlaveActorKeys:     map[string]string{"eth0": "21"},
+				SlavePartnerKeys:   map[string]string{"eth0": "40016"},
+			},
+		},
+		Slaves: map[string]map[string]collector.SlaveState{
+			"bond0": {
+				"eth0": {Name: "eth0", IsUp: false, LinkDetected: false, Speed: 1000, Duplex: "Half"},
+			},
+		},
+		Stats: map[string]collector.TrafficStats{
+			"eth0": {RXErrors: 100, TXErrors: 50, Dropped: 100, Carrier: 10},
+		},
+		Routes: collector.RouteState{
+			DefaultRouteViaBond: false,
+			GatewayReachable:    false,
+			GatewayIP:           "192.168.1.1",
+		},
+		SyslogErrors: []string{"eth0 tx timeout"},
+		RPFilter: map[string]string{
+			"all":   "1",
+			"bond0": "1",
+		},
+	}
+
+	ctx := context.Background()
+
+	checkers, err := checker.NewCheckers(&config.EthernetUserConfig{}, spec)
+	assert.NoError(t, err)
+
+	for _, c := range checkers {
+		res, err := c.Check(ctx, info)
+		assert.NoError(t, err)
+		assert.Equal(t, consts.StatusAbnormal, res.Status, "Checker %s should have failed", c.Name())
+	}
+}
diff --git a/config/default_spec.yaml b/config/default_spec.yaml index da63ee78..53aea44c 100644 --- a/config/default_spec.yaml +++ b/config/default_spec.yaml @@ -101,4 +101,17 @@ pcie_topo: pci_switches: - gpu: 1 ib: 1 - count: 8 \ No newline at end of file + count: 8 +ethernet: + default: + target_bond: "bond0" + bond_mode: "802.3ad" + mii_status: "up" + lacp_rate: "slow 0" + mtu: "1500" + speed: "25000" + min_slaves: 2 + xmit_hash_policy: "layer3+4 1" + miimon: 100 + updelay: 0 + downdelay: 0 \ No newline at end of file diff --git a/config/default_user_config.yaml b/config/default_user_config.yaml index 07605f8d..c4248935 100644 --- a/config/default_user_config.yaml +++ b/config/default_user_config.yaml @@ -65,4 +65,9 @@ nccltest: enable: true pcie_topo: - enable: true \ No newline at end of file + enable: true + +ethernet: + query_interval: 10s + cache_size: 5 + ignored_checkers: [] \ No newline at end of file diff --git a/consts/consts.go b/consts/consts.go index d2da83f9..d9bd9729 100644 --- a/consts/consts.go +++ b/consts/consts.go @@ -81,7 +81,7 @@ var ( DefaultComponentQueryInterval = time.Duration.Seconds(1) DefaultComponents = []string{ - ComponentNameCPU, ComponentNameNvidia, ComponentNameInfiniband, ComponentNameGpfs, ComponentNameDmesg, +
ComponentNameCPU, ComponentNameNvidia, ComponentNameInfiniband, ComponentNameEthernet, ComponentNameGpfs, ComponentNameDmesg, ComponentNamePodlog, ComponentNameGpuEvents, ComponentNameSyslog, } ) diff --git a/docs/ethernet.md b/docs/ethernet.md new file mode 100644 index 00000000..36bdb612 --- /dev/null +++ b/docs/ethernet.md @@ -0,0 +1,70 @@ +# Ethernet Network Check + +Ethernet network performance and operational status can be assessed through comprehensive system metrics. Proactively catching and localizing failures before they affect user workloads is crucial for maintaining cluster stability and high utilization, especially in GPU compute scenarios. + +## Ethernet Specific Checks + +To ensure the proper functioning of Ethernet interfaces (especially Bond aggregation links), SiChek performs evaluations across the following layers: + +### 1. Configuration Validation + +- **Bonding Settings**: + - Verify that the **Bond Mode** matches the expectation (e.g., 802.3ad Dynamic link aggregation). + - Confirm that **MII Status Monitoring** (Miimon) is enabled and the polling interval is reasonably configured. + - Validate **MTU** configuration consistency to prevent performance loss due to fragmentation in large packet transfers or RoCE scenarios. + - Verify that the network **Transmission Hash Policy** (xmit_hash_policy) meets business requirements (e.g., layer3+4). + +- **LACP Negotiation Baselines**: + - In 802.3ad mode, validate the **LACP Rate** (lacp_rate) configuration (fast/slow). + - Check if the **Minimum Slave Count** (MinSlaves) in multi-slave environments meets the standard. + +### 2. Runtime Error and Anomaly Detection + +- **Physical Layer (L1)**: + - **Link Detected**: Confirm that all underlying physical slave NICs have a link status of UP. + - **Physical Speed Match**: Check if the actual negotiated speed matches hardware specifications (e.g., 25Gbps/100Gbps). 
+ - **TX Timeout**: Scan kernel logs to identify transmit timeout warnings generated by drivers or firmware. + +- **Bonding Layer (L2)**: + - **Aggregator Consistency**: Check if the aggregator IDs of each slave NIC in LACP negotiation match the global active aggregator ID. + - **Partner MAC**: Identify if the switch-side LACP is responding normally (an all-zero partner MAC is treated as a failure). + - **Link Flapping and Failures**: Monitor the `Link Failure Count` history and increments for slave NICs. + +- **Connectivity and Forwarding Layer (L4/L5)**: + - **ARP Neighbor Status**: Detect abnormal `FAILED` or `INCOMPLETE` entries in the ARP (neighbor) table. + - **Gateway Reachability**: Verify default gateway connectivity via the neighbor cache, falling back to a ping probe. + - **Reverse Path Filter (RP Filter)**: Validate configuration to prevent legitimate packets from being dropped by the kernel due to asymmetric routing. + +## Key Metrics + +SiChek collects and parses the following key metrics to support the checks listed above: + +- **NIC Basic Info** (device-level) + - **Link Detected**: Physical carrier detection status. + - **Negotiated Speed**: Automatically negotiated or manually set physical layer synchronization speed. + - **Duplex Mode**: Negotiated duplex mode (Full/Half). + +- **Bonding States** (interface-level) + - **Bonding Mode**: System-defined link aggregation mode. + - **MII Status**: Logical Bond interface connectivity status. + - **MTU**: Maximum Transmission Unit size. + - **Xmit Hash Policy**: Hash algorithm for distributing packets across different slave NICs. + - **Active Aggregator ID**: Active aggregator identifier in the current 802.3ad mode. + - **Partner Mac Address**: MAC address of the successfully negotiated peer switch. + +- **Traffic Statistics** (device-level) + - **RX/TX Errors**: Count of packet errors during receiving or sending. + - **Dropped Packets**: Count of packets dropped due to insufficient buffer or configuration limits.
+ - **Carrier Errors**: Count of carrier signal loss detected at the link layer. + - **CRC Errors (Approximation)**: Cyclic Redundancy Check errors, indicating physical medium quality. + +- **Protocol and Routing States** (node-level) + - **IP Neighbor State**: Neighbor node L2 resolution status as reported by `ip neigh`. + - **Gateway Reachability**: Indicates if the default gateway responds to ICMP or L2 liveness probes. + - **Default Route Via Bond**: Confirms if the system's primary default route is physically carried on the target Bond interface. + - **rp_filter Context**: Values of `net.ipv4.conf.all.rp_filter` (system level) and `net.ipv4.conf.<bond>.rp_filter` (interface level). + +- **System Log Anomalies** (node-level) + - **Kernel Dmesg/Journal**: Real-time capture of network-related kernel error records containing "down", "fail", "error", or "flap"; the L1 check additionally looks for "tx timeout" among the captured records. + +By systematically collecting these multi-dimensional metrics and performing layered checks, SiChek can accurately identify Ethernet configuration flaws and operational risks, ensuring network reliability in large-scale computing clusters.
From 052708310e9570c63c109972896630e9f2682b01 Mon Sep 17 00:00:00 2001 From: lzi-a11y Date: Tue, 10 Mar 2026 13:54:06 +0800 Subject: [PATCH 2/3] feat: enhance ethernet collector for bond stats,and add metrics --- components/ethernet/checker/checker.go | 70 +++++++++--------- components/ethernet/collector/collector.go | 52 ++++++++------ components/ethernet/config/config.go | 1 + components/ethernet/ethernet.go | 7 ++ components/ethernet/metrics/metrics.go | 83 ++++++++++++++++++++++ config/default_user_config.yaml | 3 +- 6 files changed, 158 insertions(+), 58 deletions(-) create mode 100644 components/ethernet/metrics/metrics.go diff --git a/components/ethernet/checker/checker.go b/components/ethernet/checker/checker.go index df3179b1..4064813a 100644 --- a/components/ethernet/checker/checker.go +++ b/components/ethernet/checker/checker.go @@ -106,7 +106,7 @@ func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "LinkDown" - result.Detail += fmt.Sprintf("物理网卡 %s 链路未检测到 UP。执行命令:ethtool %s,预期:Link detected: yes,当前发现未连接或 unknown。\n", slaveName, slaveName) + result.Detail += fmt.Sprintf("Physical NIC %s link not UP. Command: ethtool %s, Expected: Link detected: yes, Actual: not connected or unknown.\n", slaveName, slaveName) } if len(info.SyslogErrors) > 0 { @@ -115,7 +115,7 @@ func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "TxTimeout" - result.Detail += fmt.Sprintf("网卡 %s 在内核日志发现 tx timeout。执行命令:dmesg | grep -iE 'eth|mlx|link'。\n", slaveName) + result.Detail += fmt.Sprintf("NIC %s tx timeout found in kernel log. 
Command: dmesg | grep -iE 'eth|mlx|link'.\n", slaveName) break } } @@ -127,7 +127,7 @@ func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "SpeedMismatch" - result.Detail += fmt.Sprintf("网卡 %s 速率不匹配。预期: %sMb/s,当前发现: %sMb/s。\n", slaveName, expectedSpeed, speedStr) + result.Detail += fmt.Sprintf("NIC %s speed mismatch. Command: ethtool %s, Expected: %sMb/s, Actual: %sMb/s.\n", slaveName, slaveName, expectedSpeed, speedStr) } // Parse stats @@ -139,7 +139,7 @@ func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "CRCErrorsGrowing" - result.Detail += fmt.Sprintf("网卡 %s RX (CRC) 错误持续增长。之前: %d,当前: %d。\n", slaveName, prev, currCRC) + result.Detail += fmt.Sprintf("NIC %s RX (CRC) errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCRC) } c.prevCRC[slaveName] = currCRC @@ -149,7 +149,7 @@ func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "CarrierErrorsGrowing" - result.Detail += fmt.Sprintf("网卡 %s Carrier 错误持续增长。之前: %d,当前: %d。\n", slaveName, prev, currCarrierIPS) + result.Detail += fmt.Sprintf("NIC %s Carrier errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCarrierIPS) } c.prevCarrier[slaveName] = currCarrierIPS @@ -159,14 +159,14 @@ func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "DropsGrowing" - result.Detail += fmt.Sprintf("网卡 %s Drops 持续增长。之前: %d,当前: %d。\n", slaveName, prev, currDrops) + result.Detail += fmt.Sprintf("NIC %s Drops increasing. 
Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currDrops) } c.prevDrops[slaveName] = currDrops } } if result.Status != consts.StatusNormal { - result.Suggestion = "请检查物理链路、网线、驱动版本(ethtool -i),或查看 dmesg 中确认具体错误;如果是速率不匹配,请检查对应配置。" + result.Suggestion = "Please check physical link, cable, driver version (ethtool -i), or check dmesg for specific errors; if speed mismatch, check corresponding configuration." } return result, nil @@ -204,7 +204,7 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "BondingMissing" - result.Detail = fmt.Sprintf("Bond %s 在 /proc/net/bonding 中缺失。\n", bond) + result.Detail = fmt.Sprintf("Bond %s missing in /proc/net/bonding. Command: ls /proc/net/bonding/.\n", bond) continue } @@ -218,7 +218,7 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "BondDown" - result.Detail += fmt.Sprintf("Bond 接口 %s 的总体状态不符合预期。命令:cat /proc/net/bonding/%s,预期:MII Status: %s,当前不匹配(可能为 down)。\n", bond, bond, expectedMII) + result.Detail += fmt.Sprintf("Overall status of bond interface %s mismatch. Command: cat /proc/net/bonding/%s, Expected: MII Status: %s, Actual: mismatch (possibly down).\n", bond, bond, expectedMII) } // check MTU @@ -230,7 +230,7 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Level = consts.LevelWarning result.ErrorName = "MTUMismatch" } - result.Detail += fmt.Sprintf("Bond %s 的 MTU 不匹配。预期: %d,实际: %d。\n", bond, expectedMTU, bState.MTU) + result.Detail += fmt.Sprintf("Bond %s MTU mismatch. 
Command: ip link show %s, Expected: %d, Actual: %d.\n", bond, bond, expectedMTU, bState.MTU) } } @@ -242,7 +242,7 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Level = consts.LevelWarning result.ErrorName = "XmitHashPolicyMismatch" } - result.Detail += fmt.Sprintf("Bond %s 的 xmit_hash_policy 不匹配。预期: %s,当前: %s。执行命令:cat /sys/class/net/%s/bonding/xmit_hash_policy。\n", bond, c.spec.XmitHashPolicy, bState.XmitHashPolicy, bond) + result.Detail += fmt.Sprintf("Bond %s xmit_hash_policy mismatch. Command: cat /sys/class/net/%s/bonding/xmit_hash_policy, Expected: %s, Actual: %s.\n", bond, bond, c.spec.XmitHashPolicy, bState.XmitHashPolicy) } } @@ -254,7 +254,7 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Level = consts.LevelWarning result.ErrorName = "SlaveCountMismatch" } - result.Detail += fmt.Sprintf("Bond %s 的 slave 数量不足。预期至少: %d,实际: %d。\n", bond, expectedMinSlaves, slaveCount) + result.Detail += fmt.Sprintf("Bond %s insufficient slave count. Command: cat /proc/net/bonding/%s, Expected at least: %d, Actual: %d.\n", bond, bond, expectedMinSlaves, slaveCount) } // check miimon, updelay, downdelay (fetching downdelay and updelay via regex since they aren't fully standard across systems on sysfs) @@ -277,7 +277,7 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "MiimonDisabled" - result.Detail += fmt.Sprintf("Bond %s 的 MII Polling Interval (miimon) 为 0,未开启底层链路检测,这会导致物理链路断开时发生持续丢包。执行命令:cat /proc/net/bonding/%s,请务必开启!\n", bond, bond) + result.Detail += fmt.Sprintf("Bond %s MII Polling Interval (miimon) is 0. 
Command: cat /proc/net/bonding/%s, please enable link detection (miimon) to avoid packet loss.\n", bond, bond) } else { if expectedMiimon > 0 && miimon != expectedMiimon { if result.Status == consts.StatusNormal { @@ -285,35 +285,35 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Level = consts.LevelWarning result.ErrorName = "MiimonMismatch" } - result.Detail += fmt.Sprintf("Bond %s 的 miimon 不匹配。预期: %d ms,实际: %d ms。执行命令:cat /sys/class/net/%s/bonding/miimon。\n", bond, expectedMiimon, miimon, bond) + result.Detail += fmt.Sprintf("Bond %s miimon mismatch. Command: cat /sys/class/net/%s/bonding/miimon, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedMiimon, miimon) } if downdelay < miimon { result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "DowndelayTooSmall" - result.Detail += fmt.Sprintf("Bond %s 的 downdelay (%d ms) 小于 miimon (%d ms),配置不合理,可能导致不必要的震荡或丢包。执行命令:cat /proc/net/bonding/%s。\n", bond, downdelay, miimon, bond) + result.Detail += fmt.Sprintf("Bond %s downdelay (%d ms) less than miimon (%d ms). Command: cat /proc/net/bonding/%s, unreasonable config may cause flapping.\n", bond, downdelay, miimon, bond) } else if expectedDownDelay > 0 && downdelay != expectedDownDelay { if result.Status == consts.StatusNormal { result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "DowndelayMismatch" } - result.Detail += fmt.Sprintf("Bond %s 的 downdelay 不匹配。预期: %d ms,实际: %d ms。执行命令:cat /sys/class/net/%s/bonding/downdelay。\n", bond, expectedDownDelay, downdelay, bond) + result.Detail += fmt.Sprintf("Bond %s downdelay mismatch. 
Command: cat /sys/class/net/%s/bonding/downdelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedDownDelay, downdelay) } if updelay == 0 { result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "UpdelayZero" - result.Detail += fmt.Sprintf("Bond %s 的 updelay 为 0,由于交换机端口转发协商需要时间,立即切回流量极易产生丢包黑洞,建议设置 updelay。执行命令:cat /proc/net/bonding/%s。\n", bond, bond) + result.Detail += fmt.Sprintf("Bond %s updelay is 0. Command: cat /proc/net/bonding/%s, updelay is recommended to avoid packet loss during switch port negotiation.\n", bond, bond) } else if expectedUpDelay > 0 && updelay != expectedUpDelay { if result.Status == consts.StatusNormal { result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "UpdelayMismatch" } - result.Detail += fmt.Sprintf("Bond %s 的 updelay 不匹配。预期: %d ms,实际: %d ms。执行命令:cat /sys/class/net/%s/bonding/updelay。\n", bond, expectedUpDelay, updelay, bond) + result.Detail += fmt.Sprintf("Bond %s updelay mismatch. Command: cat /sys/class/net/%s/bonding/updelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedUpDelay, updelay) } } @@ -325,7 +325,7 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "ActiveSlaveFlapping" - result.Detail += fmt.Sprintf("Bond %s 发生了主备端口切换。之前主端口: %s,当前主端口: %s,如果频繁切换请重点关注物理层稳定性。\n", bond, prev, currActive) + result.Detail += fmt.Sprintf("Bond %s active slave switched. Command: cat /proc/net/bonding/%s, Previous: %s, Current: %s. 
If frequent, please focus on physical layer stability.\n", bond, bond, prev, currActive) } c.prevActiveSlave[bond] = currActive } @@ -345,14 +345,14 @@ func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "LinkFailureGrowing" - result.Detail += fmt.Sprintf("Bond %s 的从属网卡 %s 发生了链路断开(Link Failure)。之前故障次数: %d,当前: %d。\n", bond, slaveName, prev, failCount) + result.Detail += fmt.Sprintf("Bond %s slave NIC %s link failure occurred. Command: cat /proc/net/bonding/%s, Previous count: %d, Current: %d.\n", bond, slaveName, bond, prev, failCount) } c.prevLinkFailures[trackKey] = failCount } } if result.Status != consts.StatusNormal { - result.Suggestion = "请使用 cat /proc/net/bonding/bond0 核对 MII 状态及 Link Failure Count;确认配置文件(如 /etc/netplan 或 sysconfig) 中 miimon>0,且 slave 绑卡数量符合预期。" + result.Suggestion = "Please use cat /proc/net/bonding/bond0 to verify MII status and Link Failure Count; ensure config (e.g., /etc/netplan) has miimon > 0." } return result, nil @@ -386,7 +386,7 @@ func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "ActiveAggregatorMissing" - result.Detail += fmt.Sprintf("Bond %s 配置为 802.3ad 模式,但未找到有效的 Active Aggregator 协商信息,可能对端交换机未配置 LACP 或链路异常。\n", bond) + result.Detail += fmt.Sprintf("Bond %s configured as 802.3ad mode but no valid Active Aggregator found. 
Command: cat /proc/net/bonding/%s, peer switch might not have LACP configured or link is abnormal.\n", bond, bond) continue } @@ -394,7 +394,7 @@ func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "PartnerMacInvalid" - result.Detail += fmt.Sprintf("Bond %s 的 Partner Mac Address 为全零。对端交换机未响应 LACP 报文,聚合失败。\n", bond) + result.Detail += fmt.Sprintf("Bond %s Partner Mac Address is all zeros. Command: cat /proc/net/bonding/%s, peer switch did not respond to LACP packets.\n", bond, bond) } for slaveName, sState := range info.Slaves[bond] { @@ -406,21 +406,21 @@ func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "AggregatorMismatch" - result.Detail += fmt.Sprintf("从属网卡 %s 的 Aggregator ID (%s) 与全局 Active Aggregator ID (%s) 不一致。该网卡虽然物理 UP,但在二层无法加入到数据转发聚合组中,请检查交换机端口配置或网线。\n", slaveName, slaveAggID, lacp.ActiveAggregatorID) + result.Detail += fmt.Sprintf("Slave NIC %s Aggregator ID (%s) mismatch with global Active Aggregator ID (%s). Command: cat /proc/net/bonding/%s, it cannot join the aggregation group.\n", slaveName, slaveAggID, lacp.ActiveAggregatorID, bond) } if portKey, ok := lacp.SlaveActorKeys[slaveName]; ok && portKey != lacp.ActorKey { result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "ActorKeyMismatch" - result.Detail += fmt.Sprintf("从属网卡 %s 的 port key (%s) 与全局 Actor Key (%s) 不一致。\n", slaveName, portKey, lacp.ActorKey) + result.Detail += fmt.Sprintf("Slave NIC %s port key (%s) mismatch with global Actor Key (%s). 
Command: cat /proc/net/bonding/%s.\n", slaveName, portKey, lacp.ActorKey, bond) } if operKey, ok := lacp.SlavePartnerKeys[slaveName]; ok && operKey != lacp.PartnerKey { result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "PartnerKeyMismatch" - result.Detail += fmt.Sprintf("从属网卡 %s 的 oper key (%s) 与全局 Partner Key (%s) 不一致,对端交换机 LACP key 协商异常。\n", slaveName, operKey, lacp.PartnerKey) + result.Detail += fmt.Sprintf("Slave NIC %s oper key (%s) mismatch with global Partner Key (%s). Command: cat /proc/net/bonding/%s, peer LACP negotiation abnormal.\n", slaveName, operKey, lacp.PartnerKey, bond) } } @@ -431,13 +431,13 @@ func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Level = consts.LevelWarning result.ErrorName = "LACPRateMismatch" } - result.Detail += fmt.Sprintf("Bond %s LACP rate 不匹配。命令:cat /sys/class/net/%s/bonding/lacp_rate,预期:%s,当前:%s。\n", bond, bond, c.spec.LACPRate, bState.LACPRate) + result.Detail += fmt.Sprintf("Bond %s LACP rate mismatch. Command: cat /sys/class/net/%s/bonding/lacp_rate, Expected: %s, Actual: %s.\n", bond, bond, c.spec.LACPRate, bState.LACPRate) } } } if result.Status != consts.StatusNormal { - result.Suggestion = "建议同步排查对端交换机 (Switch) 上的 LACP / Eth-Trunk 聚合配置是否开启和匹配。" + result.Suggestion = "Recommended to simultaneously troubleshoot LACP / Eth-Trunk aggregation config on peer switch." } return result, nil @@ -464,15 +464,15 @@ func (c *L4Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "ARPFailed" - result.Detail = "在 ARP 邻居表中发现 FAILED/INCOMPLETE 失败条目。这证明与某些邻居节点的二层 MAC 解析失败。" - result.Suggestion = "请核对本端及交换机的 VLAN ID 配置及二层放行,或使用 arping 测试连通性并在 bond 口 tcpdump 抓弃 ARP request/reply。" + result.Detail = "FAILED/INCOMPLETE entries found in ARP neighbor table. Command: ip neigh show, L2 MAC resolution failed for some neighbors." 
+ result.Suggestion = "Verify local and switch VLAN ID config and L2 forwarding, or use arping to test connectivity and tcpdump for ARP." } if info.Routes.GatewayIP != "" && !info.Routes.GatewayReachable { result.Status = consts.StatusAbnormal result.Level = consts.LevelCritical result.ErrorName = "GatewayUnreachable" - result.Detail += fmt.Sprintf("系统的网关 (%s) 不可达 (ping 失败,且不在 ARP 邻居表中)。\n", info.Routes.GatewayIP) + result.Detail += fmt.Sprintf("System gateway (%s) unreachable. Command: ping -c 3 %s && ip neigh show %s.\n", info.Routes.GatewayIP, info.Routes.GatewayIP, info.Routes.GatewayIP) } return result, nil @@ -499,7 +499,7 @@ func (c *L5Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Status = consts.StatusAbnormal result.Level = consts.LevelWarning result.ErrorName = "DirectRouteMismatch" - result.Detail += "系统的默认路由并非直接指向绑定的目标 bond 网卡,可能会导致预期业务流量不走 bond。\n" + result.Detail += "System default route does not point directly to target bond. Command: ip route show default, business traffic might not use bond.\n" } if info.RPFilter["all"] == "1" { @@ -508,7 +508,7 @@ func (c *L5Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Level = consts.LevelWarning result.ErrorName = "RPFilterEnabled" } - result.Detail += "系统启用了 rp_filter (all=1),可能导致非对称路由丢包。命令:sysctl -n net.ipv4.conf.all.rp_filter,预期:0 或 2,当前:1。\n" + result.Detail += "System enabled rp_filter (all=1). Command: sysctl -n net.ipv4.conf.all.rp_filter, Expected: 0 or 2, Actual: 1.\n" } for bond, val := range info.RPFilter { @@ -518,12 +518,12 @@ func (c *L5Checker) Check(ctx context.Context, data any) (*common.CheckerResult, result.Level = consts.LevelWarning result.ErrorName = "RPFilterEnabled" } - result.Detail += fmt.Sprintf("Bond %s 启用了 rp_filter=1。预期:0 或 2。\n", bond) + result.Detail += fmt.Sprintf("Bond %s enabled rp_filter=1. 
Command: sysctl -n net.ipv4.conf.%s.rp_filter, Expected: 0 or 2, Actual: 1.\n", bond, bond) } } if result.Status != consts.StatusNormal { - result.Suggestion = "如果发生丢包情况,建议检查路由匹配、多网卡下的策略路由(ip rule),以及将 rp_filter 配置为 0(关闭) 或 2(松散模式)。" + result.Suggestion = "If packet loss occurs, it is recommended to check route matching, policy routing (ip rule), and set rp_filter to 0 or 2." } return result, nil diff --git a/components/ethernet/collector/collector.go b/components/ethernet/collector/collector.go index c65fb327..5f639ca5 100644 --- a/components/ethernet/collector/collector.go +++ b/components/ethernet/collector/collector.go @@ -201,6 +201,10 @@ func (c *EthernetCollector) Collect(ctx context.Context) (*EthernetInfo, error) outIPSL, _ := utils.ExecCommand(ctx, "ip", "-s", "link", "show", slave) c.info.IPSLink[slave] = string(outIPSL) } + + // Also fetch stats for the bond interface itself + outIPSLBond, _ := utils.ExecCommand(ctx, "ip", "-s", "link", "show", bond) + c.info.IPSLink[bond] = string(outIPSLBond) } var grepParts []string @@ -347,29 +351,11 @@ func (c *EthernetCollector) Collect(ctx context.Context) (*EthernetInfo, error) c.info.Slaves[bond][slave] = sState - // Traffic Stats (ip -s link show) - sStats := TrafficStats{} - outIPSL := c.info.IPSLink[slave] - lines := strings.Split(outIPSL, "\n") - for i, line := range lines { - if strings.Contains(line, "RX:") && i+1 < len(lines) { - fields := strings.Fields(lines[i+1]) - if len(fields) >= 4 { - sStats.RXErrors, _ = strconv.ParseInt(fields[2], 10, 64) - sStats.Dropped, _ = strconv.ParseInt(fields[3], 10, 64) - } - } - if strings.Contains(line, "TX:") && i+1 < len(lines) { - fields := strings.Fields(lines[i+1]) - if len(fields) >= 4 { - sStats.TXErrors, _ = strconv.ParseInt(fields[2], 10, 64) - sStats.Carrier, _ = strconv.ParseInt(fields[3], 10, 64) - } - } - } - - c.info.Stats[slave] = sStats + c.info.Stats[slave] = c.parseTrafficStats(c.info.IPSLink[slave]) } + + // Also parse stats for the bond + 
c.info.Stats[bond] = c.parseTrafficStats(c.info.IPSLink[bond]) } outRPAll, _ := utils.ExecCommand(ctx, "sysctl", "-n", "net.ipv4.conf.all.rp_filter") @@ -435,3 +421,25 @@ func (c *EthernetCollector) Collect(ctx context.Context) (*EthernetInfo, error) return c.info, nil } + +func (c *EthernetCollector) parseTrafficStats(outIPSL string) TrafficStats { + sStats := TrafficStats{} + lines := strings.Split(outIPSL, "\n") + for i, line := range lines { + if strings.Contains(line, "RX:") && i+1 < len(lines) { + fields := strings.Fields(lines[i+1]) + if len(fields) >= 4 { + sStats.RXErrors, _ = strconv.ParseInt(fields[2], 10, 64) + sStats.Dropped, _ = strconv.ParseInt(fields[3], 10, 64) + } + } + if strings.Contains(line, "TX:") && i+1 < len(lines) { + fields := strings.Fields(lines[i+1]) + if len(fields) >= 4 { + sStats.TXErrors, _ = strconv.ParseInt(fields[2], 10, 64) + sStats.Carrier, _ = strconv.ParseInt(fields[3], 10, 64) + } + } + } + return sStats +} diff --git a/components/ethernet/config/config.go b/components/ethernet/config/config.go index cc5a759a..b1cee36b 100644 --- a/components/ethernet/config/config.go +++ b/components/ethernet/config/config.go @@ -28,6 +28,7 @@ type EthernetConfig struct { QueryInterval common.Duration `json:"query_interval" yaml:"query_interval"` CacheSize int64 `json:"cache_size" yaml:"cache_size"` IgnoredCheckers []string `json:"ignored_checkers" yaml:"ignored_checkers"` + EnableMetrics bool `json:"enable_metrics" yaml:"enable_metrics"` } func (c *EthernetUserConfig) GetQueryInterval() common.Duration { diff --git a/components/ethernet/ethernet.go b/components/ethernet/ethernet.go index bc9b2e9d..4e63051e 100644 --- a/components/ethernet/ethernet.go +++ b/components/ethernet/ethernet.go @@ -27,6 +27,7 @@ import ( "github.com/scitix/sichek/components/ethernet/checker" "github.com/scitix/sichek/components/ethernet/collector" "github.com/scitix/sichek/components/ethernet/config" + ethmetrics 
"github.com/scitix/sichek/components/ethernet/metrics" "github.com/scitix/sichek/consts" "github.com/scitix/sichek/pkg/utils" @@ -42,6 +43,7 @@ type component struct { collector *collector.EthernetCollector checkers []common.Checker filter *filter.EventFilter + metrics *ethmetrics.EthernetMetrics cacheMtx sync.RWMutex cacheBuffer []*common.Result @@ -139,6 +141,7 @@ func newEthernetComponent(cfgFile string, specFile string, ignoredCheckers []str cacheBuffer: make([]*common.Result, cacheSize), cacheInfo: make([]common.Info, cacheSize), cacheSize: cacheSize, + metrics: ethmetrics.NewEthernetMetrics(), } service := common.NewCommonService(ctx, cfg, component.componentName, component.GetTimeout(), component.HealthCheck) component.service = service @@ -159,6 +162,10 @@ func (c *component) HealthCheck(ctx context.Context) (*common.Result, error) { } logrus.WithField("component", "ethernet").Infof("collected ethernet info: %+v", ethInfo) + if c.cfg.Ethernet != nil && c.cfg.Ethernet.EnableMetrics { + c.metrics.ExportMetrics(ethInfo) + } + result := common.Check(ctx, c.componentName, ethInfo, c.checkers) timer.Mark("ethernet-check") diff --git a/components/ethernet/metrics/metrics.go b/components/ethernet/metrics/metrics.go new file mode 100644 index 00000000..2111307f --- /dev/null +++ b/components/ethernet/metrics/metrics.go @@ -0,0 +1,83 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package metrics + +import ( + "github.com/scitix/sichek/components/ethernet/collector" + common "github.com/scitix/sichek/metrics" +) + +const ( + MetricPrefix = "sichek_ethernet" + TagPrefix = "json" +) + +type EthernetMetrics struct { + BondStatusGauge *common.GaugeVecMetricExporter + SlaveStatusGauge *common.GaugeVecMetricExporter + RouteStatusGauge *common.GaugeVecMetricExporter + TrafficStatsGauge *common.GaugeVecMetricExporter + LACPStatusGauge *common.GaugeVecMetricExporter + SystemStatusGauge *common.GaugeVecMetricExporter +} + +func NewEthernetMetrics() *EthernetMetrics { + // Use distinct prefixes to avoid metric name collision. + // We stick to ExportStruct for consistency in label cardinality. + return &EthernetMetrics{ + BondStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_bond", []string{"bond"}), + SlaveStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_slave", []string{"bond", "slave"}), + RouteStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_route", nil), + TrafficStatsGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_traffic", []string{"interface"}), + LACPStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_lacp", []string{"bond"}), + SystemStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_system", nil), + } +} + +func (m *EthernetMetrics) ExportMetrics(info *collector.EthernetInfo) { + if info == nil { + return + } + + // 1. Export Bond Status + for bondName, bondState := range info.Bonds { + m.BondStatusGauge.ExportStruct(bondState, []string{bondName}, TagPrefix) + } + + // 2. Export Slave Status + for bondName, slaves := range info.Slaves { + for slaveName, slaveState := range slaves { + m.SlaveStatusGauge.ExportStruct(slaveState, []string{bondName, slaveName}, TagPrefix) + } + } + + // 3. Export Route Status + m.RouteStatusGauge.ExportStruct(info.Routes, []string{}, TagPrefix) + + // 4. 
Export Traffic Stats + for ifaceName, stats := range info.Stats { + m.TrafficStatsGauge.ExportStruct(stats, []string{ifaceName}, TagPrefix) + } + + // 5. Export LACP Info + for bondName, lacpState := range info.LACP { + m.LACPStatusGauge.ExportStruct(lacpState, []string{bondName}, TagPrefix) + } + + // 6. Export System Info + m.SystemStatusGauge.SetMetric("syslog_error_count", nil, float64(len(info.SyslogErrors))) + m.SystemStatusGauge.SetMetric("bond_count", nil, float64(len(info.BondInterfaces))) +} diff --git a/config/default_user_config.yaml b/config/default_user_config.yaml index c4248935..6531cbcf 100644 --- a/config/default_user_config.yaml +++ b/config/default_user_config.yaml @@ -70,4 +70,5 @@ pcie_topo: ethernet: query_interval: 10s cache_size: 5 - ignored_checkers: [] \ No newline at end of file + ignored_checkers: [] + enable_metrics: true \ No newline at end of file From 562b6ef95a1ede84704264909fc269a4b4a0bb9f Mon Sep 17 00:00:00 2001 From: lzi-a11y Date: Wed, 11 Mar 2026 17:48:02 +0800 Subject: [PATCH 3/3] feat: implement layered Ethernet network diagnostics (L1-L5 checkers) . 
--- components/ethernet/checker/checker.go | 461 ---------------------- components/ethernet/checker/l1_checker.go | 128 ++++++ components/ethernet/checker/l2_checker.go | 215 ++++++++++ components/ethernet/checker/l3_checker.go | 112 ++++++ components/ethernet/checker/l4_checker.go | 62 +++ components/ethernet/checker/l5_checker.go | 77 ++++ 6 files changed, 594 insertions(+), 461 deletions(-) create mode 100644 components/ethernet/checker/l1_checker.go create mode 100644 components/ethernet/checker/l2_checker.go create mode 100644 components/ethernet/checker/l3_checker.go create mode 100644 components/ethernet/checker/l4_checker.go create mode 100644 components/ethernet/checker/l5_checker.go diff --git a/components/ethernet/checker/checker.go b/components/ethernet/checker/checker.go index 4064813a..36d69e1a 100644 --- a/components/ethernet/checker/checker.go +++ b/components/ethernet/checker/checker.go @@ -16,16 +16,11 @@ limitations under the License. package checker import ( - "context" - "fmt" "regexp" "strconv" - "strings" "github.com/scitix/sichek/components/common" - "github.com/scitix/sichek/components/ethernet/collector" "github.com/scitix/sichek/components/ethernet/config" - "github.com/scitix/sichek/consts" ) func NewCheckers(cfg *config.EthernetUserConfig, spec *config.EthernetSpecConfig) ([]common.Checker, error) { @@ -61,14 +56,6 @@ func NewCheckers(cfg *config.EthernetUserConfig, spec *config.EthernetSpecConfig return activeCheckers, nil } -type L1Checker struct { - spec *config.EthernetSpecConfig - prevCRC map[string]int64 - prevCarrier map[string]int64 - prevDrops map[string]int64 -} - -func (c *L1Checker) Name() string { return config.EthernetL1CheckerName } // extractInt parses an integer using regex from a string pattern func extractInt(input, pattern string) int64 { @@ -80,451 +67,3 @@ func extractInt(input, pattern string) int64 { } return 0 } - -func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { - info, 
ok := data.(*collector.EthernetInfo) - if !ok { - return nil, fmt.Errorf("invalid data type") - } - - result := &common.CheckerResult{ - Name: c.Name(), - Description: config.EthernetCheckItems[c.Name()], - Status: consts.StatusNormal, - Level: consts.LevelInfo, - Curr: "OK", - } - - expectedSpeed := "25000" // default to 25G - if c.spec != nil && c.spec.Speed != "" { - expectedSpeed = c.spec.Speed - } - - for _, bond := range info.BondInterfaces { - for slaveName, slaveState := range info.Slaves[bond] { - if !slaveState.LinkDetected { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "LinkDown" - result.Detail += fmt.Sprintf("Physical NIC %s link not UP. Command: ethtool %s, Expected: Link detected: yes, Actual: not connected or unknown.\n", slaveName, slaveName) - } - - if len(info.SyslogErrors) > 0 { - for _, errLine := range info.SyslogErrors { - if strings.Contains(errLine, "tx timeout") && strings.Contains(errLine, slaveName) { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "TxTimeout" - result.Detail += fmt.Sprintf("NIC %s tx timeout found in kernel log. Command: dmesg | grep -iE 'eth|mlx|link'.\n", slaveName) - break - } - } - } - - // check speed - speedStr := strconv.Itoa(slaveState.Speed) - if speedStr != expectedSpeed && slaveState.Speed > 0 { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "SpeedMismatch" - result.Detail += fmt.Sprintf("NIC %s speed mismatch. Command: ethtool %s, Expected: %sMb/s, Actual: %sMb/s.\n", slaveName, slaveName, expectedSpeed, speedStr) - } - - // Parse stats - sStats := info.Stats[slaveName] - - // CRC errors - currCRC := sStats.RXErrors // Approximation, standard ip -s link maps CRC errors to RX errors broadly. For exact CRC, ethtool parsing should remain, but for now we follow the general RX error growth. 
- if prev, ok := c.prevCRC[slaveName]; ok && currCRC > prev { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "CRCErrorsGrowing" - result.Detail += fmt.Sprintf("NIC %s RX (CRC) errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCRC) - } - c.prevCRC[slaveName] = currCRC - - // Carrier errors - currCarrierIPS := sStats.Carrier - if prev, ok := c.prevCarrier[slaveName]; ok && currCarrierIPS > prev { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "CarrierErrorsGrowing" - result.Detail += fmt.Sprintf("NIC %s Carrier errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCarrierIPS) - } - c.prevCarrier[slaveName] = currCarrierIPS - - // Drops - currDrops := sStats.Dropped - if prev, ok := c.prevDrops[slaveName]; ok && currDrops > prev { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "DropsGrowing" - result.Detail += fmt.Sprintf("NIC %s Drops increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currDrops) - } - c.prevDrops[slaveName] = currDrops - } - } - - if result.Status != consts.StatusNormal { - result.Suggestion = "Please check physical link, cable, driver version (ethtool -i), or check dmesg for specific errors; if speed mismatch, check corresponding configuration." 
- } - - return result, nil -} - -type L2Checker struct { - spec *config.EthernetSpecConfig - prevLinkFailures map[string]int64 - prevActiveSlave map[string]string -} - -func (c *L2Checker) Name() string { return config.EthernetL2CheckerName } -func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { - info, ok := data.(*collector.EthernetInfo) - if !ok { - return nil, fmt.Errorf("invalid data type") - } - - result := &common.CheckerResult{ - Name: c.Name(), - Description: config.EthernetCheckItems[c.Name()], - Status: consts.StatusNormal, - Level: consts.LevelInfo, - Curr: "OK", - } - - expectedMinSlaves := 2 - if c.spec != nil && c.spec.MinSlaves > 0 { - expectedMinSlaves = c.spec.MinSlaves - } - - for _, bond := range info.BondInterfaces { - bState, exists := info.Bonds[bond] - if !exists || bState.Name == "" { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "BondingMissing" - result.Detail = fmt.Sprintf("Bond %s missing in /proc/net/bonding. Command: ls /proc/net/bonding/.\n", bond) - continue - } - - expectedMII := "up" - procContent := info.ProcNetBonding[bond] - if c.spec != nil && c.spec.MIIStatus != "" { - expectedMII = c.spec.MIIStatus - } - - if (expectedMII == "up" && !bState.IsUp) || !strings.Contains(procContent, "MII Status: "+expectedMII) { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "BondDown" - result.Detail += fmt.Sprintf("Overall status of bond interface %s mismatch. 
Command: cat /proc/net/bonding/%s, Expected: MII Status: %s, Actual: mismatch (possibly down).\n", bond, bond, expectedMII) - } - - // check MTU - if c.spec != nil && c.spec.MTU != "" { - expectedMTU, _ := strconv.Atoi(c.spec.MTU) - if bState.MTU > 0 && bState.MTU != expectedMTU { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "MTUMismatch" - } - result.Detail += fmt.Sprintf("Bond %s MTU mismatch. Command: ip link show %s, Expected: %d, Actual: %d.\n", bond, bond, expectedMTU, bState.MTU) - } - } - - // check xmit_hash_policy - if c.spec != nil && c.spec.XmitHashPolicy != "" { - if bState.XmitHashPolicy != "" && bState.XmitHashPolicy != c.spec.XmitHashPolicy { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "XmitHashPolicyMismatch" - } - result.Detail += fmt.Sprintf("Bond %s xmit_hash_policy mismatch. Command: cat /sys/class/net/%s/bonding/xmit_hash_policy, Expected: %s, Actual: %s.\n", bond, bond, c.spec.XmitHashPolicy, bState.XmitHashPolicy) - } - } - - // check slave count - slaveCount := len(info.Slaves[bond]) - if slaveCount < expectedMinSlaves { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "SlaveCountMismatch" - } - result.Detail += fmt.Sprintf("Bond %s insufficient slave count. 
Command: cat /proc/net/bonding/%s, Expected at least: %d, Actual: %d.\n", bond, bond, expectedMinSlaves, slaveCount) - } - - // check miimon, updelay, downdelay (fetching downdelay and updelay via regex since they aren't fully standard across systems on sysfs) - miimon := int64(bState.Miimon) - updelay := extractInt(procContent, `Up Delay \(ms\):\s*(\d+)`) - downdelay := extractInt(procContent, `Down Delay \(ms\):\s*(\d+)`) - - expectedMiimon := int64(0) - expectedUpDelay := int64(0) - expectedDownDelay := int64(0) - if c.spec != nil { - if c.spec.Miimon > 0 { - expectedMiimon = int64(c.spec.Miimon) - } - expectedUpDelay = int64(c.spec.UpDelay) - expectedDownDelay = int64(c.spec.DownDelay) - } - - if miimon == 0 { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "MiimonDisabled" - result.Detail += fmt.Sprintf("Bond %s MII Polling Interval (miimon) is 0. Command: cat /proc/net/bonding/%s, please enable link detection (miimon) to avoid packet loss.\n", bond, bond) - } else { - if expectedMiimon > 0 && miimon != expectedMiimon { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "MiimonMismatch" - } - result.Detail += fmt.Sprintf("Bond %s miimon mismatch. Command: cat /sys/class/net/%s/bonding/miimon, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedMiimon, miimon) - } - - if downdelay < miimon { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "DowndelayTooSmall" - result.Detail += fmt.Sprintf("Bond %s downdelay (%d ms) less than miimon (%d ms). 
Command: cat /proc/net/bonding/%s, unreasonable config may cause flapping.\n", bond, downdelay, miimon, bond) - } else if expectedDownDelay > 0 && downdelay != expectedDownDelay { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "DowndelayMismatch" - } - result.Detail += fmt.Sprintf("Bond %s downdelay mismatch. Command: cat /sys/class/net/%s/bonding/downdelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedDownDelay, downdelay) - } - - if updelay == 0 { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "UpdelayZero" - result.Detail += fmt.Sprintf("Bond %s updelay is 0. Command: cat /proc/net/bonding/%s, updelay is recommended to avoid packet loss during switch port negotiation.\n", bond, bond) - } else if expectedUpDelay > 0 && updelay != expectedUpDelay { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "UpdelayMismatch" - } - result.Detail += fmt.Sprintf("Bond %s updelay mismatch. Command: cat /sys/class/net/%s/bonding/updelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedUpDelay, updelay) - } - } - - // track active slave for flapping detection - activeSlaveMatch := regexp.MustCompile(`Currently Active Slave:\s*(\w+)`).FindStringSubmatch(procContent) - if len(activeSlaveMatch) > 1 { - currActive := activeSlaveMatch[1] - if prev, ok := c.prevActiveSlave[bond]; ok && prev != "" && prev != currActive { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "ActiveSlaveFlapping" - result.Detail += fmt.Sprintf("Bond %s active slave switched. Command: cat /proc/net/bonding/%s, Previous: %s, Current: %s. 
If frequent, please focus on physical layer stability.\n", bond, bond, prev, currActive) - } - c.prevActiveSlave[bond] = currActive - } - - // track Link Failure Count per slave - slavesData := strings.Split(procContent, "Slave Interface: ") - for i := 1; i < len(slavesData); i++ { - lines := strings.Split(slavesData[i], "\n") - if len(lines) == 0 { - continue - } - slaveName := strings.TrimSpace(lines[0]) - failCount := extractInt(slavesData[i], `Link Failure Count:\s*(\d+)`) - trackKey := bond + "-" + slaveName - - if prev, ok := c.prevLinkFailures[trackKey]; ok && failCount > prev { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "LinkFailureGrowing" - result.Detail += fmt.Sprintf("Bond %s slave NIC %s link failure occurred. Command: cat /proc/net/bonding/%s, Previous count: %d, Current: %d.\n", bond, slaveName, bond, prev, failCount) - } - c.prevLinkFailures[trackKey] = failCount - } - } - - if result.Status != consts.StatusNormal { - result.Suggestion = "Please use cat /proc/net/bonding/bond0 to verify MII status and Link Failure Count; ensure config (e.g., /etc/netplan) has miimon > 0." 
- } - - return result, nil -} - -type L3Checker struct{ spec *config.EthernetSpecConfig } - -func (c *L3Checker) Name() string { return config.EthernetL3CheckerName } -func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { - info, ok := data.(*collector.EthernetInfo) - if !ok { - return nil, fmt.Errorf("invalid data type") - } - - result := &common.CheckerResult{ - Name: c.Name(), - Description: config.EthernetCheckItems[c.Name()], - Status: consts.StatusNormal, - Level: consts.LevelInfo, - Curr: "OK", - } - - for _, bond := range info.BondInterfaces { - bState, ok := info.Bonds[bond] - if !ok || !strings.Contains(bState.Mode, "802.3ad") { - continue - } - - lacp, exists := info.LACP[bond] - if !exists || lacp.PartnerMacAddress == "" { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "ActiveAggregatorMissing" - result.Detail += fmt.Sprintf("Bond %s configured as 802.3ad mode but no valid Active Aggregator found. Command: cat /proc/net/bonding/%s, peer switch might not have LACP configured or link is abnormal.\n", bond, bond) - continue - } - - if lacp.PartnerMacAddress == "00:00:00:00:00:00" { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "PartnerMacInvalid" - result.Detail += fmt.Sprintf("Bond %s Partner Mac Address is all zeros. Command: cat /proc/net/bonding/%s, peer switch did not respond to LACP packets.\n", bond, bond) - } - - for slaveName, sState := range info.Slaves[bond] { - if !sState.IsUp { - continue - } - - if slaveAggID, ok := lacp.SlaveAggregatorIDs[slaveName]; ok && slaveAggID != lacp.ActiveAggregatorID { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "AggregatorMismatch" - result.Detail += fmt.Sprintf("Slave NIC %s Aggregator ID (%s) mismatch with global Active Aggregator ID (%s). 
Command: cat /proc/net/bonding/%s, it cannot join the aggregation group.\n", slaveName, slaveAggID, lacp.ActiveAggregatorID, bond) - } - - if portKey, ok := lacp.SlaveActorKeys[slaveName]; ok && portKey != lacp.ActorKey { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "ActorKeyMismatch" - result.Detail += fmt.Sprintf("Slave NIC %s port key (%s) mismatch with global Actor Key (%s). Command: cat /proc/net/bonding/%s.\n", slaveName, portKey, lacp.ActorKey, bond) - } - - if operKey, ok := lacp.SlavePartnerKeys[slaveName]; ok && operKey != lacp.PartnerKey { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "PartnerKeyMismatch" - result.Detail += fmt.Sprintf("Slave NIC %s oper key (%s) mismatch with global Partner Key (%s). Command: cat /proc/net/bonding/%s, peer LACP negotiation abnormal.\n", slaveName, operKey, lacp.PartnerKey, bond) - } - } - - if c.spec != nil && c.spec.LACPRate != "" { - if !strings.Contains(bState.LACPRate, c.spec.LACPRate) { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "LACPRateMismatch" - } - result.Detail += fmt.Sprintf("Bond %s LACP rate mismatch. Command: cat /sys/class/net/%s/bonding/lacp_rate, Expected: %s, Actual: %s.\n", bond, bond, c.spec.LACPRate, bState.LACPRate) - } - } - } - - if result.Status != consts.StatusNormal { - result.Suggestion = "Recommended to simultaneously troubleshoot LACP / Eth-Trunk aggregation config on peer switch." 
- } - - return result, nil -} - -type L4Checker struct{ spec *config.EthernetSpecConfig } - -func (c *L4Checker) Name() string { return config.EthernetL4CheckerName } -func (c *L4Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { - info, ok := data.(*collector.EthernetInfo) - if !ok { - return nil, fmt.Errorf("invalid data type") - } - - result := &common.CheckerResult{ - Name: c.Name(), - Description: config.EthernetCheckItems[c.Name()], - Status: consts.StatusNormal, - Level: consts.LevelInfo, - Curr: "OK", - } - - if strings.Contains(info.IPNeigh, "FAILED") || strings.Contains(info.IPNeigh, "INCOMPLETE") { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "ARPFailed" - result.Detail = "FAILED/INCOMPLETE entries found in ARP neighbor table. Command: ip neigh show, L2 MAC resolution failed for some neighbors." - result.Suggestion = "Verify local and switch VLAN ID config and L2 forwarding, or use arping to test connectivity and tcpdump for ARP." - } - - if info.Routes.GatewayIP != "" && !info.Routes.GatewayReachable { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelCritical - result.ErrorName = "GatewayUnreachable" - result.Detail += fmt.Sprintf("System gateway (%s) unreachable. 
Command: ping -c 3 %s && ip neigh show %s.\n", info.Routes.GatewayIP, info.Routes.GatewayIP, info.Routes.GatewayIP) - } - - return result, nil -} - -type L5Checker struct{ spec *config.EthernetSpecConfig } - -func (c *L5Checker) Name() string { return config.EthernetL5CheckerName } -func (c *L5Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { - info, ok := data.(*collector.EthernetInfo) - if !ok { - return nil, fmt.Errorf("invalid data type") - } - - result := &common.CheckerResult{ - Name: c.Name(), - Description: config.EthernetCheckItems[c.Name()], - Status: consts.StatusNormal, - Level: consts.LevelInfo, - Curr: "OK", - } - - if !info.Routes.DefaultRouteViaBond { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "DirectRouteMismatch" - result.Detail += "System default route does not point directly to target bond. Command: ip route show default, business traffic might not use bond.\n" - } - - if info.RPFilter["all"] == "1" { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "RPFilterEnabled" - } - result.Detail += "System enabled rp_filter (all=1). Command: sysctl -n net.ipv4.conf.all.rp_filter, Expected: 0 or 2, Actual: 1.\n" - } - - for bond, val := range info.RPFilter { - if bond != "all" && val == "1" { - if result.Status == consts.StatusNormal { - result.Status = consts.StatusAbnormal - result.Level = consts.LevelWarning - result.ErrorName = "RPFilterEnabled" - } - result.Detail += fmt.Sprintf("Bond %s enabled rp_filter=1. Command: sysctl -n net.ipv4.conf.%s.rp_filter, Expected: 0 or 2, Actual: 1.\n", bond, bond) - } - } - - if result.Status != consts.StatusNormal { - result.Suggestion = "If packet loss occurs, it is recommended to check route matching, policy routing (ip rule), and set rp_filter to 0 or 2." 
- } - - return result, nil -} diff --git a/components/ethernet/checker/l1_checker.go b/components/ethernet/checker/l1_checker.go new file mode 100644 index 00000000..17a32227 --- /dev/null +++ b/components/ethernet/checker/l1_checker.go @@ -0,0 +1,128 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "fmt" + "strconv" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L1Checker struct { + spec *config.EthernetSpecConfig + prevCRC map[string]int64 + prevCarrier map[string]int64 + prevDrops map[string]int64 +} + +func (c *L1Checker) Name() string { return config.EthernetL1CheckerName } + +func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + expectedSpeed := "25000" // default to 25G + if c.spec != nil && c.spec.Speed != "" { + expectedSpeed = c.spec.Speed + } + + for _, bond := range info.BondInterfaces { + for slaveName, slaveState := range info.Slaves[bond] { + if !slaveState.LinkDetected { + result.Status = 
consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "LinkDown" + result.Detail += fmt.Sprintf("Physical NIC %s link not UP. Command: ethtool %s, Expected: Link detected: yes, Actual: not connected or unknown.\n", slaveName, slaveName) + } + + if len(info.SyslogErrors) > 0 { + for _, errLine := range info.SyslogErrors { + if strings.Contains(errLine, "tx timeout") && strings.Contains(errLine, slaveName) { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "TxTimeout" + result.Detail += fmt.Sprintf("NIC %s tx timeout found in kernel log. Command: dmesg | grep -iE 'eth|mlx|link'.\n", slaveName) + break + } + } + } + + // check speed + speedStr := strconv.Itoa(slaveState.Speed) + if speedStr != expectedSpeed && slaveState.Speed > 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "SpeedMismatch" + result.Detail += fmt.Sprintf("NIC %s speed mismatch. Command: ethtool %s, Expected: %sMb/s, Actual: %sMb/s.\n", slaveName, slaveName, expectedSpeed, speedStr) + } + + // Parse stats + sStats := info.Stats[slaveName] + + // CRC errors + currCRC := sStats.RXErrors // Approximation, standard ip -s link maps CRC errors to RX errors broadly. For exact CRC, ethtool parsing should remain, but for now we follow the general RX error growth. + if prev, ok := c.prevCRC[slaveName]; ok && currCRC > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "CRCErrorsGrowing" + result.Detail += fmt.Sprintf("NIC %s RX (CRC) errors increasing. 
Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCRC) + } + c.prevCRC[slaveName] = currCRC + + // Carrier errors + currCarrierIPS := sStats.Carrier + if prev, ok := c.prevCarrier[slaveName]; ok && currCarrierIPS > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "CarrierErrorsGrowing" + result.Detail += fmt.Sprintf("NIC %s Carrier errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCarrierIPS) + } + c.prevCarrier[slaveName] = currCarrierIPS + + // Drops + currDrops := sStats.Dropped + if prev, ok := c.prevDrops[slaveName]; ok && currDrops > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DropsGrowing" + result.Detail += fmt.Sprintf("NIC %s Drops increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currDrops) + } + c.prevDrops[slaveName] = currDrops + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "Please check physical link, cable, driver version (ethtool -i), or check dmesg for specific errors; if speed mismatch, check corresponding configuration." + } + + return result, nil +} diff --git a/components/ethernet/checker/l2_checker.go b/components/ethernet/checker/l2_checker.go new file mode 100644 index 00000000..e65d3e59 --- /dev/null +++ b/components/ethernet/checker/l2_checker.go @@ -0,0 +1,215 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "fmt" + "regexp" + "strconv" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L2Checker struct { + spec *config.EthernetSpecConfig + prevLinkFailures map[string]int64 + prevActiveSlave map[string]string +} + +func (c *L2Checker) Name() string { return config.EthernetL2CheckerName } +func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + expectedMinSlaves := 2 + if c.spec != nil && c.spec.MinSlaves > 0 { + expectedMinSlaves = c.spec.MinSlaves + } + + for _, bond := range info.BondInterfaces { + bState, exists := info.Bonds[bond] + if !exists || bState.Name == "" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "BondingMissing" + result.Detail = fmt.Sprintf("Bond %s missing in /proc/net/bonding. Command: ls /proc/net/bonding/.\n", bond) + continue + } + + expectedMII := "up" + procContent := info.ProcNetBonding[bond] + if c.spec != nil && c.spec.MIIStatus != "" { + expectedMII = c.spec.MIIStatus + } + + if (expectedMII == "up" && !bState.IsUp) || !strings.Contains(procContent, "MII Status: "+expectedMII) { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "BondDown" + result.Detail += fmt.Sprintf("Overall status of bond interface %s mismatch. 
Command: cat /proc/net/bonding/%s, Expected: MII Status: %s, Actual: mismatch (possibly down).\n", bond, bond, expectedMII) + } + + // check MTU + if c.spec != nil && c.spec.MTU != "" { + expectedMTU, _ := strconv.Atoi(c.spec.MTU) + if bState.MTU > 0 && bState.MTU != expectedMTU { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "MTUMismatch" + } + result.Detail += fmt.Sprintf("Bond %s MTU mismatch. Command: ip link show %s, Expected: %d, Actual: %d.\n", bond, bond, expectedMTU, bState.MTU) + } + } + + // check xmit_hash_policy + if c.spec != nil && c.spec.XmitHashPolicy != "" { + if bState.XmitHashPolicy != "" && bState.XmitHashPolicy != c.spec.XmitHashPolicy { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "XmitHashPolicyMismatch" + } + result.Detail += fmt.Sprintf("Bond %s xmit_hash_policy mismatch. Command: cat /sys/class/net/%s/bonding/xmit_hash_policy, Expected: %s, Actual: %s.\n", bond, bond, c.spec.XmitHashPolicy, bState.XmitHashPolicy) + } + } + + // check slave count + slaveCount := len(info.Slaves[bond]) + if slaveCount < expectedMinSlaves { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "SlaveCountMismatch" + } + result.Detail += fmt.Sprintf("Bond %s insufficient slave count. 
Command: cat /proc/net/bonding/%s, Expected at least: %d, Actual: %d.\n", bond, bond, expectedMinSlaves, slaveCount) + } + + // check miimon, updelay, downdelay (fetching downdelay and updelay via regex since they aren't fully standard across systems on sysfs) + miimon := int64(bState.Miimon) + updelay := extractInt(procContent, `Up Delay \(ms\):\s*(\d+)`) + downdelay := extractInt(procContent, `Down Delay \(ms\):\s*(\d+)`) + + expectedMiimon := int64(0) + expectedUpDelay := int64(0) + expectedDownDelay := int64(0) + if c.spec != nil { + if c.spec.Miimon > 0 { + expectedMiimon = int64(c.spec.Miimon) + } + expectedUpDelay = int64(c.spec.UpDelay) + expectedDownDelay = int64(c.spec.DownDelay) + } + + if miimon == 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "MiimonDisabled" + result.Detail += fmt.Sprintf("Bond %s MII Polling Interval (miimon) is 0. Command: cat /proc/net/bonding/%s, please enable link detection (miimon) to avoid packet loss.\n", bond, bond) + } else { + if expectedMiimon > 0 && miimon != expectedMiimon { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "MiimonMismatch" + } + result.Detail += fmt.Sprintf("Bond %s miimon mismatch. Command: cat /sys/class/net/%s/bonding/miimon, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedMiimon, miimon) + } + + if downdelay < miimon { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DowndelayTooSmall" + result.Detail += fmt.Sprintf("Bond %s downdelay (%d ms) less than miimon (%d ms). 
Command: cat /proc/net/bonding/%s, unreasonable config may cause flapping.\n", bond, downdelay, miimon, bond) + } else if expectedDownDelay > 0 && downdelay != expectedDownDelay { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DowndelayMismatch" + } + result.Detail += fmt.Sprintf("Bond %s downdelay mismatch. Command: cat /sys/class/net/%s/bonding/downdelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedDownDelay, downdelay) + } + + if updelay == 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "UpdelayZero" + result.Detail += fmt.Sprintf("Bond %s updelay is 0. Command: cat /proc/net/bonding/%s, updelay is recommended to avoid packet loss during switch port negotiation.\n", bond, bond) + } else if expectedUpDelay > 0 && updelay != expectedUpDelay { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "UpdelayMismatch" + } + result.Detail += fmt.Sprintf("Bond %s updelay mismatch. Command: cat /sys/class/net/%s/bonding/updelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedUpDelay, updelay) + } + } + + // track active slave for flapping detection + activeSlaveMatch := regexp.MustCompile(`Currently Active Slave:\s*(\w+)`).FindStringSubmatch(procContent) + if len(activeSlaveMatch) > 1 { + currActive := activeSlaveMatch[1] + if prev, ok := c.prevActiveSlave[bond]; ok && prev != "" && prev != currActive { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ActiveSlaveFlapping" + result.Detail += fmt.Sprintf("Bond %s active slave switched. Command: cat /proc/net/bonding/%s, Previous: %s, Current: %s. 
If frequent, please focus on physical layer stability.\n", bond, bond, prev, currActive) + } + c.prevActiveSlave[bond] = currActive + } + + // track Link Failure Count per slave + slavesData := strings.Split(procContent, "Slave Interface: ") + for i := 1; i < len(slavesData); i++ { + lines := strings.Split(slavesData[i], "\n") + if len(lines) == 0 { + continue + } + slaveName := strings.TrimSpace(lines[0]) + failCount := extractInt(slavesData[i], `Link Failure Count:\s*(\d+)`) + trackKey := bond + "-" + slaveName + + if prev, ok := c.prevLinkFailures[trackKey]; ok && failCount > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "LinkFailureGrowing" + result.Detail += fmt.Sprintf("Bond %s slave NIC %s link failure occurred. Command: cat /proc/net/bonding/%s, Previous count: %d, Current: %d.\n", bond, slaveName, bond, prev, failCount) + } + c.prevLinkFailures[trackKey] = failCount + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "Please use cat /proc/net/bonding/bond0 to verify MII status and Link Failure Count; ensure config (e.g., /etc/netplan) has miimon > 0." + } + + return result, nil +} diff --git a/components/ethernet/checker/l3_checker.go b/components/ethernet/checker/l3_checker.go new file mode 100644 index 00000000..9a0cb8cf --- /dev/null +++ b/components/ethernet/checker/l3_checker.go @@ -0,0 +1,112 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package checker + +import ( + "context" + "fmt" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L3Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L3Checker) Name() string { return config.EthernetL3CheckerName } +func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + for _, bond := range info.BondInterfaces { + bState, ok := info.Bonds[bond] + if !ok || !strings.Contains(bState.Mode, "802.3ad") { + continue + } + + lacp, exists := info.LACP[bond] + if !exists || lacp.PartnerMacAddress == "" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "ActiveAggregatorMissing" + result.Detail += fmt.Sprintf("Bond %s configured as 802.3ad mode but no valid Active Aggregator found. Command: cat /proc/net/bonding/%s, peer switch might not have LACP configured or link is abnormal.\n", bond, bond) + continue + } + + if lacp.PartnerMacAddress == "00:00:00:00:00:00" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "PartnerMacInvalid" + result.Detail += fmt.Sprintf("Bond %s Partner Mac Address is all zeros. 
Command: cat /proc/net/bonding/%s, peer switch did not respond to LACP packets.\n", bond, bond) + } + + for slaveName, sState := range info.Slaves[bond] { + if !sState.IsUp { + continue + } + + if slaveAggID, ok := lacp.SlaveAggregatorIDs[slaveName]; ok && slaveAggID != lacp.ActiveAggregatorID { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "AggregatorMismatch" + result.Detail += fmt.Sprintf("Slave NIC %s Aggregator ID (%s) mismatch with global Active Aggregator ID (%s). Command: cat /proc/net/bonding/%s, it cannot join the aggregation group.\n", slaveName, slaveAggID, lacp.ActiveAggregatorID, bond) + } + + if portKey, ok := lacp.SlaveActorKeys[slaveName]; ok && portKey != lacp.ActorKey { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ActorKeyMismatch" + result.Detail += fmt.Sprintf("Slave NIC %s port key (%s) mismatch with global Actor Key (%s). Command: cat /proc/net/bonding/%s.\n", slaveName, portKey, lacp.ActorKey, bond) + } + + if operKey, ok := lacp.SlavePartnerKeys[slaveName]; ok && operKey != lacp.PartnerKey { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "PartnerKeyMismatch" + result.Detail += fmt.Sprintf("Slave NIC %s oper key (%s) mismatch with global Partner Key (%s). Command: cat /proc/net/bonding/%s, peer LACP negotiation abnormal.\n", slaveName, operKey, lacp.PartnerKey, bond) + } + } + + if c.spec != nil && c.spec.LACPRate != "" { + if !strings.Contains(bState.LACPRate, c.spec.LACPRate) { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "LACPRateMismatch" + } + result.Detail += fmt.Sprintf("Bond %s LACP rate mismatch. 
Command: cat /sys/class/net/%s/bonding/lacp_rate, Expected: %s, Actual: %s.\n", bond, bond, c.spec.LACPRate, bState.LACPRate) + } + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "Recommended to simultaneously troubleshoot LACP / Eth-Trunk aggregation config on peer switch." + } + + return result, nil +} diff --git a/components/ethernet/checker/l4_checker.go b/components/ethernet/checker/l4_checker.go new file mode 100644 index 00000000..0099d0cf --- /dev/null +++ b/components/ethernet/checker/l4_checker.go @@ -0,0 +1,62 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package checker + +import ( + "context" + "fmt" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L4Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L4Checker) Name() string { return config.EthernetL4CheckerName } +func (c *L4Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + if strings.Contains(info.IPNeigh, "FAILED") || strings.Contains(info.IPNeigh, "INCOMPLETE") { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ARPFailed" + result.Detail = "FAILED/INCOMPLETE entries found in ARP neighbor table. Command: ip neigh show, L2 MAC resolution failed for some neighbors." + result.Suggestion = "Verify local and switch VLAN ID config and L2 forwarding, or use arping to test connectivity and tcpdump for ARP." + } + + if info.Routes.GatewayIP != "" && !info.Routes.GatewayReachable { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "GatewayUnreachable" + result.Detail += fmt.Sprintf("System gateway (%s) unreachable. Command: ping -c 3 %s && ip neigh show %s.\n", info.Routes.GatewayIP, info.Routes.GatewayIP, info.Routes.GatewayIP) + } + + return result, nil +} diff --git a/components/ethernet/checker/l5_checker.go b/components/ethernet/checker/l5_checker.go new file mode 100644 index 00000000..d96dff03 --- /dev/null +++ b/components/ethernet/checker/l5_checker.go @@ -0,0 +1,77 @@ +/* +Copyright 2024 The Scitix Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "fmt" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L5Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L5Checker) Name() string { return config.EthernetL5CheckerName } +func (c *L5Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + if !info.Routes.DefaultRouteViaBond { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DirectRouteMismatch" + result.Detail += "System default route does not point directly to target bond. Command: ip route show default, business traffic might not use bond.\n" + } + + if info.RPFilter["all"] == "1" { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "RPFilterEnabled" + } + result.Detail += "System enabled rp_filter (all=1). 
Command: sysctl -n net.ipv4.conf.all.rp_filter, Expected: 0 or 2, Actual: 1.\n" + } + + for bond, val := range info.RPFilter { + if bond != "all" && val == "1" { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "RPFilterEnabled" + } + result.Detail += fmt.Sprintf("Bond %s enabled rp_filter=1. Command: sysctl -n net.ipv4.conf.%s.rp_filter, Expected: 0 or 2, Actual: 1.\n", bond, bond) + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "If packet loss occurs, it is recommended to check route matching, policy routing (ip rule), and set rp_filter to 0 or 2." + } + + return result, nil +}