diff --git a/cmd/command/command.go b/cmd/command/command.go index 22eff6a6..f8bad6ac 100644 --- a/cmd/command/command.go +++ b/cmd/command/command.go @@ -41,6 +41,8 @@ func NewRootCmd() *cobra.Command { "h": true, "all": true, "run": true, + "ethernet": true, + "e": true, } if commandsRequireRoot[cmd.Use] { @@ -57,7 +59,7 @@ func NewRootCmd() *cobra.Command { rootCmd.AddCommand(component.NewCPUCmd()) rootCmd.AddCommand(component.NewNvidiaCmd()) rootCmd.AddCommand(component.NewInfinibandCmd()) - // rootCmd.AddCommand(component.NewEthernetCmd()) + rootCmd.AddCommand(component.NewEthernetCmd()) rootCmd.AddCommand(component.NewGpfsCmd()) rootCmd.AddCommand(component.NewPodLogCmd()) rootCmd.AddCommand(component.NewDmesgCmd()) diff --git a/cmd/command/component/all.go b/cmd/command/component/all.go index fe0533f5..e3d636dd 100644 --- a/cmd/command/component/all.go +++ b/cmd/command/component/all.go @@ -26,6 +26,7 @@ import ( "github.com/scitix/sichek/components/common" "github.com/scitix/sichek/components/cpu" "github.com/scitix/sichek/components/dmesg" + "github.com/scitix/sichek/components/ethernet" "github.com/scitix/sichek/components/gpfs" gpuevents "github.com/scitix/sichek/components/gpuevents" "github.com/scitix/sichek/components/infiniband" @@ -186,6 +187,8 @@ func NewComponent(componentName string, cfgFile string, specFile string, ignored case consts.ComponentNameSyslog: // if skipPercent is -1, use the value from the config file return syslog.NewComponent(cfgFile, "", -1) + case consts.ComponentNameEthernet: + return ethernet.NewEthernetComponent(cfgFile, specFile, ignoredCheckers) default: return nil, fmt.Errorf("invalid component name: %s", componentName) } diff --git a/cmd/command/component/ethernet.go b/cmd/command/component/ethernet.go new file mode 100644 index 00000000..3ef6824c --- /dev/null +++ b/cmd/command/component/ethernet.go @@ -0,0 +1,93 @@ +/* +Copyright 2024 The Scitix Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package component + +import ( + "context" + "strings" + + "github.com/scitix/sichek/cmd/command/spec" + "github.com/scitix/sichek/components/ethernet" + "github.com/scitix/sichek/consts" + + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" +) + +func NewEthernetCmd() *cobra.Command { + var ( + cfgFile string + specFile string + ignoredCheckersStr string + verbose bool + ) + ethernetCmd := &cobra.Command{ + Use: "ethernet", + Aliases: []string{"e"}, + Short: "Perform Ethernet HealthCheck", + Run: func(cmd *cobra.Command, args []string) { + ctx, cancel := context.WithTimeout(context.Background(), consts.CmdTimeout) + + if !verbose { + logrus.SetLevel(logrus.ErrorLevel) + defer cancel() + } else { + logrus.SetLevel(logrus.DebugLevel) + defer func() { + logrus.WithField("component", "ethernet").Info("Run ethernet Cmd context canceled") + cancel() + }() + } + + resolvedCfgFile, err := spec.EnsureCfgFile(cfgFile) + if err != nil { + logrus.WithField("daemon", "ethernet").Errorf("failed to load cfgFile: %v", err) + } else { + logrus.WithField("daemon", "ethernet").Info("load cfgFile: " + resolvedCfgFile) + } + resolvedSpecFile, err := spec.EnsureSpecFile(specFile) + if err != nil { + logrus.WithField("daemon", "ethernet").Errorf("failed to load specFile: %v", err) + } else { + logrus.WithField("daemon", "ethernet").Info("load specFile: " + resolvedSpecFile) + } + + var ignoredCheckers []string + if len(ignoredCheckersStr) > 0 { + 
ignoredCheckers = strings.Split(ignoredCheckersStr, ",") + } + + component, err := ethernet.NewEthernetComponent(resolvedCfgFile, resolvedSpecFile, ignoredCheckers) + if err != nil { + logrus.WithField("component", "ethernet").Error(err) + return + } + logrus.WithField("component", "ethernet").Infof("Run Ethernet component check: %s", component.Name()) + result, err := RunComponentCheck(ctx, component, consts.CmdTimeout) + if err != nil { + return + } + PrintCheckResults(true, result) + }, + } + + ethernetCmd.Flags().StringVarP(&cfgFile, "cfg", "c", "", "Path to the user config file") + ethernetCmd.Flags().StringVarP(&specFile, "spec", "s", "", "Path to the Ethernet specification file") + ethernetCmd.Flags().StringVarP(&ignoredCheckersStr, "ignored-checkers", "i", "", "Ignored checkers") + ethernetCmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output") + + return ethernetCmd +} diff --git a/components/ethernet/checker/checker.go b/components/ethernet/checker/checker.go new file mode 100644 index 00000000..36d69e1a --- /dev/null +++ b/components/ethernet/checker/checker.go @@ -0,0 +1,69 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package checker + +import ( + "regexp" + "strconv" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/config" +) + +func NewCheckers(cfg *config.EthernetUserConfig, spec *config.EthernetSpecConfig) ([]common.Checker, error) { + checkers := []common.Checker{ + &L1Checker{ + spec: spec, + prevCRC: make(map[string]int64), + prevCarrier: make(map[string]int64), + prevDrops: make(map[string]int64), + }, + &L2Checker{ + spec: spec, + prevLinkFailures: make(map[string]int64), + prevActiveSlave: make(map[string]string), + }, + &L3Checker{spec: spec}, + &L4Checker{spec: spec}, + &L5Checker{spec: spec}, + } + // Filter skipped checkers + ignoredMap := make(map[string]bool) + if cfg != nil && cfg.Ethernet != nil { + for _, v := range cfg.Ethernet.IgnoredCheckers { + ignoredMap[v] = true + } + } + var activeCheckers []common.Checker + for _, chk := range checkers { + if !ignoredMap[chk.Name()] { + activeCheckers = append(activeCheckers, chk) + } + } + return activeCheckers, nil +} + + +// extractInt parses an integer using regex from a string pattern +func extractInt(input, pattern string) int64 { + re := regexp.MustCompile(pattern) + matches := re.FindStringSubmatch(input) + if len(matches) > 1 { + val, _ := strconv.ParseInt(matches[1], 10, 64) + return val + } + return 0 +} diff --git a/components/ethernet/checker/l1_checker.go b/components/ethernet/checker/l1_checker.go new file mode 100644 index 00000000..17a32227 --- /dev/null +++ b/components/ethernet/checker/l1_checker.go @@ -0,0 +1,128 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "fmt" + "strconv" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L1Checker struct { + spec *config.EthernetSpecConfig + prevCRC map[string]int64 + prevCarrier map[string]int64 + prevDrops map[string]int64 +} + +func (c *L1Checker) Name() string { return config.EthernetL1CheckerName } + +func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + expectedSpeed := "25000" // default to 25G + if c.spec != nil && c.spec.Speed != "" { + expectedSpeed = c.spec.Speed + } + + for _, bond := range info.BondInterfaces { + for slaveName, slaveState := range info.Slaves[bond] { + if !slaveState.LinkDetected { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "LinkDown" + result.Detail += fmt.Sprintf("Physical NIC %s link not UP. 
Command: ethtool %s, Expected: Link detected: yes, Actual: not connected or unknown.\n", slaveName, slaveName) + } + + if len(info.SyslogErrors) > 0 { + for _, errLine := range info.SyslogErrors { + if strings.Contains(errLine, "tx timeout") && strings.Contains(errLine, slaveName) { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "TxTimeout" + result.Detail += fmt.Sprintf("NIC %s tx timeout found in kernel log. Command: dmesg | grep -iE 'eth|mlx|link'.\n", slaveName) + break + } + } + } + + // check speed + speedStr := strconv.Itoa(slaveState.Speed) + if speedStr != expectedSpeed && slaveState.Speed > 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "SpeedMismatch" + result.Detail += fmt.Sprintf("NIC %s speed mismatch. Command: ethtool %s, Expected: %sMb/s, Actual: %sMb/s.\n", slaveName, slaveName, expectedSpeed, speedStr) + } + + // Parse stats + sStats := info.Stats[slaveName] + + // CRC errors + currCRC := sStats.RXErrors // Approximation, standard ip -s link maps CRC errors to RX errors broadly. For exact CRC, ethtool parsing should remain, but for now we follow the general RX error growth. + if prev, ok := c.prevCRC[slaveName]; ok && currCRC > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "CRCErrorsGrowing" + result.Detail += fmt.Sprintf("NIC %s RX (CRC) errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCRC) + } + c.prevCRC[slaveName] = currCRC + + // Carrier errors + currCarrierIPS := sStats.Carrier + if prev, ok := c.prevCarrier[slaveName]; ok && currCarrierIPS > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "CarrierErrorsGrowing" + result.Detail += fmt.Sprintf("NIC %s Carrier errors increasing. 
Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCarrierIPS) + } + c.prevCarrier[slaveName] = currCarrierIPS + + // Drops + currDrops := sStats.Dropped + if prev, ok := c.prevDrops[slaveName]; ok && currDrops > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DropsGrowing" + result.Detail += fmt.Sprintf("NIC %s Drops increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currDrops) + } + c.prevDrops[slaveName] = currDrops + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "Please check physical link, cable, driver version (ethtool -i), or check dmesg for specific errors; if speed mismatch, check corresponding configuration." + } + + return result, nil +} diff --git a/components/ethernet/checker/l2_checker.go b/components/ethernet/checker/l2_checker.go new file mode 100644 index 00000000..e65d3e59 --- /dev/null +++ b/components/ethernet/checker/l2_checker.go @@ -0,0 +1,215 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package checker + +import ( + "context" + "fmt" + "regexp" + "strconv" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L2Checker struct { + spec *config.EthernetSpecConfig + prevLinkFailures map[string]int64 + prevActiveSlave map[string]string +} + +func (c *L2Checker) Name() string { return config.EthernetL2CheckerName } +func (c *L2Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + expectedMinSlaves := 2 + if c.spec != nil && c.spec.MinSlaves > 0 { + expectedMinSlaves = c.spec.MinSlaves + } + + for _, bond := range info.BondInterfaces { + bState, exists := info.Bonds[bond] + if !exists || bState.Name == "" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "BondingMissing" + result.Detail = fmt.Sprintf("Bond %s missing in /proc/net/bonding. Command: ls /proc/net/bonding/.\n", bond) + continue + } + + expectedMII := "up" + procContent := info.ProcNetBonding[bond] + if c.spec != nil && c.spec.MIIStatus != "" { + expectedMII = c.spec.MIIStatus + } + + if (expectedMII == "up" && !bState.IsUp) || !strings.Contains(procContent, "MII Status: "+expectedMII) { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "BondDown" + result.Detail += fmt.Sprintf("Overall status of bond interface %s mismatch. 
Command: cat /proc/net/bonding/%s, Expected: MII Status: %s, Actual: mismatch (possibly down).\n", bond, bond, expectedMII) + } + + // check MTU + if c.spec != nil && c.spec.MTU != "" { + expectedMTU, _ := strconv.Atoi(c.spec.MTU) + if bState.MTU > 0 && bState.MTU != expectedMTU { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "MTUMismatch" + } + result.Detail += fmt.Sprintf("Bond %s MTU mismatch. Command: ip link show %s, Expected: %d, Actual: %d.\n", bond, bond, expectedMTU, bState.MTU) + } + } + + // check xmit_hash_policy + if c.spec != nil && c.spec.XmitHashPolicy != "" { + if bState.XmitHashPolicy != "" && bState.XmitHashPolicy != c.spec.XmitHashPolicy { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "XmitHashPolicyMismatch" + } + result.Detail += fmt.Sprintf("Bond %s xmit_hash_policy mismatch. Command: cat /sys/class/net/%s/bonding/xmit_hash_policy, Expected: %s, Actual: %s.\n", bond, bond, c.spec.XmitHashPolicy, bState.XmitHashPolicy) + } + } + + // check slave count + slaveCount := len(info.Slaves[bond]) + if slaveCount < expectedMinSlaves { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "SlaveCountMismatch" + } + result.Detail += fmt.Sprintf("Bond %s insufficient slave count. 
Command: cat /proc/net/bonding/%s, Expected at least: %d, Actual: %d.\n", bond, bond, expectedMinSlaves, slaveCount) + } + + // check miimon, updelay, downdelay (fetching downdelay and updelay via regex since they aren't fully standard across systems on sysfs) + miimon := int64(bState.Miimon) + updelay := extractInt(procContent, `Up Delay \(ms\):\s*(\d+)`) + downdelay := extractInt(procContent, `Down Delay \(ms\):\s*(\d+)`) + + expectedMiimon := int64(0) + expectedUpDelay := int64(0) + expectedDownDelay := int64(0) + if c.spec != nil { + if c.spec.Miimon > 0 { + expectedMiimon = int64(c.spec.Miimon) + } + expectedUpDelay = int64(c.spec.UpDelay) + expectedDownDelay = int64(c.spec.DownDelay) + } + + if miimon == 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "MiimonDisabled" + result.Detail += fmt.Sprintf("Bond %s MII Polling Interval (miimon) is 0. Command: cat /proc/net/bonding/%s, please enable link detection (miimon) to avoid packet loss.\n", bond, bond) + } else { + if expectedMiimon > 0 && miimon != expectedMiimon { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "MiimonMismatch" + } + result.Detail += fmt.Sprintf("Bond %s miimon mismatch. Command: cat /sys/class/net/%s/bonding/miimon, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedMiimon, miimon) + } + + if downdelay < miimon { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DowndelayTooSmall" + result.Detail += fmt.Sprintf("Bond %s downdelay (%d ms) less than miimon (%d ms). 
Command: cat /proc/net/bonding/%s, unreasonable config may cause flapping.\n", bond, downdelay, miimon, bond) + } else if expectedDownDelay > 0 && downdelay != expectedDownDelay { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DowndelayMismatch" + } + result.Detail += fmt.Sprintf("Bond %s downdelay mismatch. Command: cat /sys/class/net/%s/bonding/downdelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedDownDelay, downdelay) + } + + if updelay == 0 { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "UpdelayZero" + result.Detail += fmt.Sprintf("Bond %s updelay is 0. Command: cat /proc/net/bonding/%s, updelay is recommended to avoid packet loss during switch port negotiation.\n", bond, bond) + } else if expectedUpDelay > 0 && updelay != expectedUpDelay { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "UpdelayMismatch" + } + result.Detail += fmt.Sprintf("Bond %s updelay mismatch. Command: cat /sys/class/net/%s/bonding/updelay, Expected: %d ms, Actual: %d ms.\n", bond, bond, expectedUpDelay, updelay) + } + } + + // track active slave for flapping detection + activeSlaveMatch := regexp.MustCompile(`Currently Active Slave:\s*(\w+)`).FindStringSubmatch(procContent) + if len(activeSlaveMatch) > 1 { + currActive := activeSlaveMatch[1] + if prev, ok := c.prevActiveSlave[bond]; ok && prev != "" && prev != currActive { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ActiveSlaveFlapping" + result.Detail += fmt.Sprintf("Bond %s active slave switched. Command: cat /proc/net/bonding/%s, Previous: %s, Current: %s. 
If frequent, please focus on physical layer stability.\n", bond, bond, prev, currActive) + } + c.prevActiveSlave[bond] = currActive + } + + // track Link Failure Count per slave + slavesData := strings.Split(procContent, "Slave Interface: ") + for i := 1; i < len(slavesData); i++ { + lines := strings.Split(slavesData[i], "\n") + if len(lines) == 0 { + continue + } + slaveName := strings.TrimSpace(lines[0]) + failCount := extractInt(slavesData[i], `Link Failure Count:\s*(\d+)`) + trackKey := bond + "-" + slaveName + + if prev, ok := c.prevLinkFailures[trackKey]; ok && failCount > prev { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "LinkFailureGrowing" + result.Detail += fmt.Sprintf("Bond %s slave NIC %s link failure occurred. Command: cat /proc/net/bonding/%s, Previous count: %d, Current: %d.\n", bond, slaveName, bond, prev, failCount) + } + c.prevLinkFailures[trackKey] = failCount + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "Please use cat /proc/net/bonding/bond0 to verify MII status and Link Failure Count; ensure config (e.g., /etc/netplan) has miimon > 0." + } + + return result, nil +} diff --git a/components/ethernet/checker/l3_checker.go b/components/ethernet/checker/l3_checker.go new file mode 100644 index 00000000..9a0cb8cf --- /dev/null +++ b/components/ethernet/checker/l3_checker.go @@ -0,0 +1,112 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package checker + +import ( + "context" + "fmt" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L3Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L3Checker) Name() string { return config.EthernetL3CheckerName } +func (c *L3Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + for _, bond := range info.BondInterfaces { + bState, ok := info.Bonds[bond] + if !ok || !strings.Contains(bState.Mode, "802.3ad") { + continue + } + + lacp, exists := info.LACP[bond] + if !exists || lacp.PartnerMacAddress == "" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "ActiveAggregatorMissing" + result.Detail += fmt.Sprintf("Bond %s configured as 802.3ad mode but no valid Active Aggregator found. Command: cat /proc/net/bonding/%s, peer switch might not have LACP configured or link is abnormal.\n", bond, bond) + continue + } + + if lacp.PartnerMacAddress == "00:00:00:00:00:00" { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "PartnerMacInvalid" + result.Detail += fmt.Sprintf("Bond %s Partner Mac Address is all zeros. 
Command: cat /proc/net/bonding/%s, peer switch did not respond to LACP packets.\n", bond, bond) + } + + for slaveName, sState := range info.Slaves[bond] { + if !sState.IsUp { + continue + } + + if slaveAggID, ok := lacp.SlaveAggregatorIDs[slaveName]; ok && slaveAggID != lacp.ActiveAggregatorID { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "AggregatorMismatch" + result.Detail += fmt.Sprintf("Slave NIC %s Aggregator ID (%s) mismatch with global Active Aggregator ID (%s). Command: cat /proc/net/bonding/%s, it cannot join the aggregation group.\n", slaveName, slaveAggID, lacp.ActiveAggregatorID, bond) + } + + if portKey, ok := lacp.SlaveActorKeys[slaveName]; ok && portKey != lacp.ActorKey { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ActorKeyMismatch" + result.Detail += fmt.Sprintf("Slave NIC %s port key (%s) mismatch with global Actor Key (%s). Command: cat /proc/net/bonding/%s.\n", slaveName, portKey, lacp.ActorKey, bond) + } + + if operKey, ok := lacp.SlavePartnerKeys[slaveName]; ok && operKey != lacp.PartnerKey { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "PartnerKeyMismatch" + result.Detail += fmt.Sprintf("Slave NIC %s oper key (%s) mismatch with global Partner Key (%s). Command: cat /proc/net/bonding/%s, peer LACP negotiation abnormal.\n", slaveName, operKey, lacp.PartnerKey, bond) + } + } + + if c.spec != nil && c.spec.LACPRate != "" { + if !strings.Contains(bState.LACPRate, c.spec.LACPRate) { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "LACPRateMismatch" + } + result.Detail += fmt.Sprintf("Bond %s LACP rate mismatch. 
Command: cat /sys/class/net/%s/bonding/lacp_rate, Expected: %s, Actual: %s.\n", bond, bond, c.spec.LACPRate, bState.LACPRate) + } + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "Recommended to simultaneously troubleshoot LACP / Eth-Trunk aggregation config on peer switch." + } + + return result, nil +} diff --git a/components/ethernet/checker/l4_checker.go b/components/ethernet/checker/l4_checker.go new file mode 100644 index 00000000..0099d0cf --- /dev/null +++ b/components/ethernet/checker/l4_checker.go @@ -0,0 +1,62 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package checker + +import ( + "context" + "fmt" + "strings" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L4Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L4Checker) Name() string { return config.EthernetL4CheckerName } +func (c *L4Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + if strings.Contains(info.IPNeigh, "FAILED") || strings.Contains(info.IPNeigh, "INCOMPLETE") { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "ARPFailed" + result.Detail = "FAILED/INCOMPLETE entries found in ARP neighbor table. Command: ip neigh show, L2 MAC resolution failed for some neighbors." + result.Suggestion = "Verify local and switch VLAN ID config and L2 forwarding, or use arping to test connectivity and tcpdump for ARP." + } + + if info.Routes.GatewayIP != "" && !info.Routes.GatewayReachable { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelCritical + result.ErrorName = "GatewayUnreachable" + result.Detail += fmt.Sprintf("System gateway (%s) unreachable. Command: ping -c 3 %s && ip neigh show %s.\n", info.Routes.GatewayIP, info.Routes.GatewayIP, info.Routes.GatewayIP) + } + + return result, nil +} diff --git a/components/ethernet/checker/l5_checker.go b/components/ethernet/checker/l5_checker.go new file mode 100644 index 00000000..d96dff03 --- /dev/null +++ b/components/ethernet/checker/l5_checker.go @@ -0,0 +1,77 @@ +/* +Copyright 2024 The Scitix Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "fmt" + + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" +) + +type L5Checker struct{ spec *config.EthernetSpecConfig } + +func (c *L5Checker) Name() string { return config.EthernetL5CheckerName } +func (c *L5Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + info, ok := data.(*collector.EthernetInfo) + if !ok { + return nil, fmt.Errorf("invalid data type") + } + + result := &common.CheckerResult{ + Name: c.Name(), + Description: config.EthernetCheckItems[c.Name()], + Status: consts.StatusNormal, + Level: consts.LevelInfo, + Curr: "OK", + } + + if !info.Routes.DefaultRouteViaBond { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "DirectRouteMismatch" + result.Detail += "System default route does not point directly to target bond. Command: ip route show default, business traffic might not use bond.\n" + } + + if info.RPFilter["all"] == "1" { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "RPFilterEnabled" + } + result.Detail += "System enabled rp_filter (all=1). 
Command: sysctl -n net.ipv4.conf.all.rp_filter, Expected: 0 or 2, Actual: 1.\n" + } + + for bond, val := range info.RPFilter { + if bond != "all" && val == "1" { + if result.Status == consts.StatusNormal { + result.Status = consts.StatusAbnormal + result.Level = consts.LevelWarning + result.ErrorName = "RPFilterEnabled" + } + result.Detail += fmt.Sprintf("Bond %s enabled rp_filter=1. Command: sysctl -n net.ipv4.conf.%s.rp_filter, Expected: 0 or 2, Actual: 1.\n", bond, bond) + } + } + + if result.Status != consts.StatusNormal { + result.Suggestion = "If packet loss occurs, it is recommended to check route matching, policy routing (ip rule), and set rp_filter to 0 or 2." + } + + return result, nil +} diff --git a/components/ethernet/collector/collector.go b/components/ethernet/collector/collector.go new file mode 100644 index 00000000..5f639ca5 --- /dev/null +++ b/components/ethernet/collector/collector.go @@ -0,0 +1,445 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package collector + +import ( + "context" + "encoding/json" + "fmt" + "regexp" + "strconv" + "strings" + + "github.com/scitix/sichek/pkg/utils" +) + +type BondState struct { + Name string `json:"name"` + IsUp bool `json:"is_up"` + HasLowerUp bool `json:"has_lower_up"` + IPAddr string `json:"ip_addr"` + MTU int `json:"mtu"` + Mode string `json:"mode"` + Miimon int `json:"miimon"` + XmitHashPolicy string `json:"xmit_hash_policy"` + LACPRate string `json:"lacp_rate"` + ActiveSlave string `json:"active_slave"` +} + +type SlaveState struct { + Name string `json:"name"` + IsUp bool `json:"is_up"` + LinkDetected bool `json:"link_detected"` + Speed int `json:"speed"` // Mbps + Duplex string `json:"duplex"` // "Full", "Half" +} + +type LACPState struct { + ActiveAggregatorID string `json:"active_aggregator_id"` + ActorKey string `json:"actor_key"` + PartnerKey string `json:"partner_key"` + PartnerMacAddress string `json:"partner_mac_address"` + SlaveAggregatorIDs map[string]string `json:"slave_aggregator_ids"` + SlaveActorKeys map[string]string `json:"slave_actor_keys"` + SlavePartnerKeys map[string]string `json:"slave_partner_keys"` +} + +type TrafficStats struct { + RXErrors int64 `json:"rx_errors"` + TXErrors int64 `json:"tx_errors"` + Dropped int64 `json:"dropped"` + Carrier int64 `json:"carrier"` +} + +type RouteState struct { + DefaultRouteViaBond bool `json:"default_route_via_bond"` + GatewayReachable bool `json:"gateway_reachable"` + GatewayIP string `json:"gateway_ip"` +} + +type EthernetInfo struct { + BondInterfaces []string + Bonds map[string]BondState + Slaves map[string]map[string]SlaveState // Bond -> SlaveName -> Info + LACP map[string]LACPState + Stats map[string]TrafficStats // Maps iface -> stats + Routes RouteState + SyslogErrors []string + + // Legacy string outputs (kept temporarily for backwards compatibility with un-migrated checkers) + ProcNetBonding map[string]string + BondSlaves map[string][]string + Ethtool map[string]string + EthtoolS 
map[string]string + EthtoolI map[string]string + IPSLink map[string]string + IPLink string + IPAddr string + IPRoute string + IPRule string + IPNeigh string + BridgeVlan string + BridgeFdb string + Dmesg string + RPFilter map[string]string + SysfsBonding map[string]map[string]string +} + +func (e *EthernetInfo) JSON() (string, error) { + b, err := json.Marshal(e) + return string(b), err +} + +type EthernetCollector struct { + name string + info *EthernetInfo + targetBond string +} + +func NewEthernetCollector(targetBond string) (*EthernetCollector, error) { + return &EthernetCollector{ + name: "EthernetCollector", + targetBond: targetBond, + info: &EthernetInfo{ + Bonds: make(map[string]BondState), + Slaves: make(map[string]map[string]SlaveState), + LACP: make(map[string]LACPState), + Stats: make(map[string]TrafficStats), + SyslogErrors: make([]string, 0), + ProcNetBonding: make(map[string]string), + SysfsBonding: make(map[string]map[string]string), + BondSlaves: make(map[string][]string), + Ethtool: make(map[string]string), + EthtoolS: make(map[string]string), + EthtoolI: make(map[string]string), + IPSLink: make(map[string]string), + RPFilter: make(map[string]string), + }, + }, nil +} + +func (c *EthernetCollector) Name() string { + return c.name +} + +func (c *EthernetCollector) Collect(ctx context.Context) (*EthernetInfo, error) { + out, _ := utils.ExecCommand(ctx, "ip", "-o", "link", "show", "type", "bond") + lines := strings.Split(string(out), "\n") + c.info.BondInterfaces = nil + for _, l := range lines { + if l == "" { + continue + } + parts := strings.Split(l, ": ") + if len(parts) >= 2 { + name := strings.Split(strings.TrimSpace(parts[1]), "@")[0] + c.info.BondInterfaces = append(c.info.BondInterfaces, name) + } + } + + // Filter based on targetBond + if c.targetBond != "" { + var filtered []string + for _, b := range c.info.BondInterfaces { + if b == c.targetBond { + filtered = append(filtered, b) + break + } + } + c.info.BondInterfaces = filtered + } + + 
for _, bond := range c.info.BondInterfaces { + c.info.Bonds[bond] = BondState{Name: bond} + c.info.Slaves[bond] = make(map[string]SlaveState) + + outProc, _ := utils.ExecCommand(ctx, "cat", "/proc/net/bonding/"+bond) + c.info.ProcNetBonding[bond] = string(outProc) + + // Parse BondState config from sysfs + attrs := []string{"mode", "miimon", "slaves", "xmit_hash_policy", "lacp_rate"} + if c.info.SysfsBonding[bond] == nil { + c.info.SysfsBonding[bond] = make(map[string]string) + } + for _, attr := range attrs { + outAttr, _ := utils.ExecCommand(ctx, "cat", "/sys/class/net/"+bond+"/bonding/"+attr) + c.info.SysfsBonding[bond][attr] = strings.TrimSpace(string(outAttr)) + } + + slavesStr := c.info.SysfsBonding[bond]["slaves"] + slaves := strings.Fields(slavesStr) + c.info.BondSlaves[bond] = slaves + + // Fetch sysctl rp_filter for bond + outRP, _ := utils.ExecCommand(ctx, "sysctl", "-n", "net.ipv4.conf."+bond+".rp_filter") + c.info.RPFilter[bond] = strings.TrimSpace(string(outRP)) + + // For each slave, fetch L1 info + for _, slave := range slaves { + outEth, _ := utils.ExecCommand(ctx, "ethtool", slave) + c.info.Ethtool[slave] = string(outEth) + + outEthS, _ := utils.ExecCommand(ctx, "ethtool", "-S", slave) + c.info.EthtoolS[slave] = string(outEthS) + + outEthI, _ := utils.ExecCommand(ctx, "ethtool", "-i", slave) + c.info.EthtoolI[slave] = string(outEthI) + + outIPSL, _ := utils.ExecCommand(ctx, "ip", "-s", "link", "show", slave) + c.info.IPSLink[slave] = string(outIPSL) + } + + // Also fetch stats for the bond interface itself + outIPSLBond, _ := utils.ExecCommand(ctx, "ip", "-s", "link", "show", bond) + c.info.IPSLink[bond] = string(outIPSLBond) + } + + var grepParts []string + if c.targetBond != "" { + grepParts = append(grepParts, c.targetBond) + } + + filterArgs := []string{"grep", "-iE", strings.Join(grepParts, "|")} + + if len(grepParts) > 0 { + outIPLink, _ := utils.ExecCommand(ctx, "sh", "-c", "ip -d link | "+strings.Join(filterArgs, " ")) + c.info.IPLink = 
string(outIPLink) + + outIPAddr, _ := utils.ExecCommand(ctx, "sh", "-c", "ip addr | "+strings.Join(filterArgs, " ")) + c.info.IPAddr = string(outIPAddr) + + outIPRoute, _ := utils.ExecCommand(ctx, "sh", "-c", "ip route | "+strings.Join(filterArgs, " ")) + c.info.IPRoute = string(outIPRoute) + + outIPRule, _ := utils.ExecCommand(ctx, "sh", "-c", "ip rule | "+strings.Join(filterArgs, " ")) + c.info.IPRule = string(outIPRule) + + outIPNeigh, _ := utils.ExecCommand(ctx, "sh", "-c", "ip neigh | "+strings.Join(filterArgs, " ")) + c.info.IPNeigh = string(outIPNeigh) + + outBridgeVlan, _ := utils.ExecCommand(ctx, "sh", "-c", "bridge vlan show | "+strings.Join(filterArgs, " ")) + c.info.BridgeVlan = string(outBridgeVlan) + + outBridgeFdb, _ := utils.ExecCommand(ctx, "sh", "-c", "bridge fdb show | "+strings.Join(filterArgs, " ")) + c.info.BridgeFdb = string(outBridgeFdb) + } else { + outIPLink, _ := utils.ExecCommand(ctx, "ip", "-d", "link") + c.info.IPLink = string(outIPLink) + + outIPAddr, _ := utils.ExecCommand(ctx, "ip", "addr") + c.info.IPAddr = string(outIPAddr) + + outIPRoute, _ := utils.ExecCommand(ctx, "ip", "route") + c.info.IPRoute = string(outIPRoute) + + outIPRule, _ := utils.ExecCommand(ctx, "ip", "rule") + c.info.IPRule = string(outIPRule) + + outIPNeigh, _ := utils.ExecCommand(ctx, "ip", "neigh") + c.info.IPNeigh = string(outIPNeigh) + + outBridgeVlan, _ := utils.ExecCommand(ctx, "bridge", "vlan", "show") + c.info.BridgeVlan = string(outBridgeVlan) + + outBridgeFdb, _ := utils.ExecCommand(ctx, "bridge", "fdb", "show") + c.info.BridgeFdb = string(outBridgeFdb) + } + + // Post-process parsed KV states for bonds + for _, bond := range c.info.BondInterfaces { + bState := c.info.Bonds[bond] + bState.Mode = c.info.SysfsBonding[bond]["mode"] + bState.XmitHashPolicy = c.info.SysfsBonding[bond]["xmit_hash_policy"] + bState.LACPRate = c.info.SysfsBonding[bond]["lacp_rate"] + bState.Miimon, _ = strconv.Atoi(c.info.SysfsBonding[bond]["miimon"]) + + // Link states from IP 
Link / Addr + bState.IsUp = strings.Contains(c.info.IPLink, fmt.Sprintf("%s: ", bond)) || strings.Contains(c.info.IPLink, "LOWER_UP") + + mtuMatch := regexp.MustCompile(fmt.Sprintf(`%s:.*mtu (\d+)`, bond)).FindStringSubmatch(c.info.IPLink) + if len(mtuMatch) > 1 { + bState.MTU, _ = strconv.Atoi(mtuMatch[1]) + } + + ipMatch := regexp.MustCompile(fmt.Sprintf(`inet ([\d\.]+)/\d+.*%s`, bond)).FindStringSubmatch(c.info.IPAddr) + if len(ipMatch) > 1 { + bState.IPAddr = ipMatch[1] + } + + // Parse ProcNetBonding for Active Slave and 802.3ad info + procStr := c.info.ProcNetBonding[bond] + activeSlaveMatch := regexp.MustCompile(`Currently Active Slave:\s*(\w+)`).FindStringSubmatch(procStr) + if len(activeSlaveMatch) > 1 { + bState.ActiveSlave = activeSlaveMatch[1] + } + + c.info.Bonds[bond] = bState + + // Populate LACP State + if strings.Contains(procStr, "Bonding Mode: IEEE 802.3ad") { + lacp := LACPState{ + SlaveAggregatorIDs: make(map[string]string), + SlaveActorKeys: make(map[string]string), + SlavePartnerKeys: make(map[string]string), + } + activeAggMatch := regexp.MustCompile(`(?s)Active Aggregator Info:\s*Aggregator ID:\s*(\d+).*?Actor Key:\s*(\d+).*?Partner Key:\s*(\d+).*?Partner Mac Address:\s*([\w:]+)`).FindStringSubmatch(procStr) + if len(activeAggMatch) > 4 { + lacp.ActiveAggregatorID = activeAggMatch[1] + lacp.ActorKey = activeAggMatch[2] + lacp.PartnerKey = activeAggMatch[3] + lacp.PartnerMacAddress = activeAggMatch[4] + } + + slavesData := strings.Split(procStr, "Slave Interface: ") + for i := 1; i < len(slavesData); i++ { + lines := strings.Split(slavesData[i], "\n") + if len(lines) == 0 { + continue + } + sName := strings.TrimSpace(lines[0]) + aggIDMatch := regexp.MustCompile(`Aggregator ID:\s*(\d+)`).FindStringSubmatch(slavesData[i]) + if len(aggIDMatch) > 1 { + lacp.SlaveAggregatorIDs[sName] = aggIDMatch[1] + } + + actorMatch := regexp.MustCompile(`port key:\s*(\d+)`).FindStringSubmatch(slavesData[i]) + if len(actorMatch) > 1 { + 
lacp.SlaveActorKeys[sName] = actorMatch[1] + } + + partnerMatch := regexp.MustCompile(`oper key:\s*(\d+)`).FindStringSubmatch(slavesData[i]) + if len(partnerMatch) > 1 { + lacp.SlavePartnerKeys[sName] = partnerMatch[1] + } + } + c.info.LACP[bond] = lacp + } + } + + for _, bond := range c.info.BondInterfaces { + for _, slave := range c.info.BondSlaves[bond] { + sState := SlaveState{Name: slave} + + // IsUp and Link detection + outEth := c.info.Ethtool[slave] + sState.LinkDetected = strings.Contains(outEth, "Link detected: yes") + sState.IsUp = strings.Contains(c.info.IPLink, fmt.Sprintf("%s: 1 { + sState.Speed, _ = strconv.Atoi(speedMatch[1]) + } + + duplexMatch := regexp.MustCompile(`Duplex:\s*(\w+)`).FindStringSubmatch(outEth) + if len(duplexMatch) > 1 { + sState.Duplex = duplexMatch[1] + } + + c.info.Slaves[bond][slave] = sState + + c.info.Stats[slave] = c.parseTrafficStats(c.info.IPSLink[slave]) + } + + // Also parse stats for the bond + c.info.Stats[bond] = c.parseTrafficStats(c.info.IPSLink[bond]) + } + + outRPAll, _ := utils.ExecCommand(ctx, "sysctl", "-n", "net.ipv4.conf.all.rp_filter") + c.info.RPFilter["all"] = strings.TrimSpace(string(outRPAll)) + + // Parse RouteState + rState := RouteState{} + routeLines := strings.Split(c.info.IPRoute, "\n") + for _, line := range routeLines { + if strings.HasPrefix(line, "default via ") { + fields := strings.Fields(line) + if len(fields) >= 5 { + rState.GatewayIP = fields[2] + if fields[4] == c.targetBond || (c.targetBond == "" && len(c.info.BondInterfaces) > 0 && fields[4] == c.info.BondInterfaces[0]) { + rState.DefaultRouteViaBond = true + } + } + break + } + } + + if rState.GatewayIP != "" { + // Check reachable in neigh + neighMatch := regexp.MustCompile(fmt.Sprintf(`%s\s+dev\s+.*?lladdr.*?REACHABLE`, regexp.QuoteMeta(rState.GatewayIP))).FindStringSubmatch(c.info.IPNeigh) + if len(neighMatch) > 0 { + rState.GatewayReachable = true + } else { + // Fallback ping if not in neigh Cache instantly + pingOut, _ := 
utils.ExecCommand(ctx, "ping", "-c", "1", "-W", "1", rState.GatewayIP) + if strings.Contains(string(pingOut), "1 received") { + rState.GatewayReachable = true + } + } + } + c.info.Routes = rState + + // Parse Syslog Errors + dmesgGrep := "eth|mlx|link|bond" + if len(grepParts) > 0 { + dmesgGrep = strings.Join(grepParts, "|") + } + outDmesg, _ := utils.ExecCommand(ctx, "sh", "-c", fmt.Sprintf("dmesg | grep -iE '%s' | tail -n 100", dmesgGrep)) + + dmesgStr := string(outDmesg) + c.info.Dmesg = dmesgStr + + for _, l := range strings.Split(dmesgStr, "\n") { + lowerLine := strings.ToLower(l) + if strings.Contains(lowerLine, "down") || strings.Contains(lowerLine, "fail") || strings.Contains(lowerLine, "error") || strings.Contains(lowerLine, "flap") { + c.info.SyslogErrors = append(c.info.SyslogErrors, strings.TrimSpace(l)) + } + } + + // Also append journalctl errors specifically for bonding + outJournal, err := utils.ExecCommand(ctx, "sh", "-c", fmt.Sprintf("journalctl -k -S \"1 hour ago\" | grep -iE '%s' | grep -iE 'down|fail|flap|error' | tail -n 20", dmesgGrep)) + if err == nil { + for _, l := range strings.Split(string(outJournal), "\n") { + if strings.TrimSpace(l) != "" { + c.info.SyslogErrors = append(c.info.SyslogErrors, strings.TrimSpace(l)) + } + } + } + + return c.info, nil +} + +func (c *EthernetCollector) parseTrafficStats(outIPSL string) TrafficStats { + sStats := TrafficStats{} + lines := strings.Split(outIPSL, "\n") + for i, line := range lines { + if strings.Contains(line, "RX:") && i+1 < len(lines) { + fields := strings.Fields(lines[i+1]) + if len(fields) >= 4 { + sStats.RXErrors, _ = strconv.ParseInt(fields[2], 10, 64) + sStats.Dropped, _ = strconv.ParseInt(fields[3], 10, 64) + } + } + if strings.Contains(line, "TX:") && i+1 < len(lines) { + fields := strings.Fields(lines[i+1]) + if len(fields) >= 4 { + sStats.TXErrors, _ = strconv.ParseInt(fields[2], 10, 64) + sStats.Carrier, _ = strconv.ParseInt(fields[3], 10, 64) + } + } + } + return sStats +} diff 
--git a/components/ethernet/config/config.go b/components/ethernet/config/config.go new file mode 100644 index 00000000..b1cee36b --- /dev/null +++ b/components/ethernet/config/config.go @@ -0,0 +1,68 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package config + +import ( + "github.com/scitix/sichek/components/common" + "github.com/scitix/sichek/consts" +) + +type EthernetUserConfig struct { + Ethernet *EthernetConfig `json:"ethernet" yaml:"ethernet"` +} + +type EthernetConfig struct { + QueryInterval common.Duration `json:"query_interval" yaml:"query_interval"` + CacheSize int64 `json:"cache_size" yaml:"cache_size"` + IgnoredCheckers []string `json:"ignored_checkers" yaml:"ignored_checkers"` + EnableMetrics bool `json:"enable_metrics" yaml:"enable_metrics"` +} + +func (c *EthernetUserConfig) GetQueryInterval() common.Duration { + if c.Ethernet == nil { + return common.Duration{} + } + return c.Ethernet.QueryInterval +} + +func (c *EthernetUserConfig) SetQueryInterval(newInterval common.Duration) { + if c.Ethernet == nil { + c.Ethernet = &EthernetConfig{} + } + c.Ethernet.QueryInterval = newInterval +} + +const ( + EthernetL1CheckerName = "L1(Physical Link)" + EthernetL2CheckerName = "L2(Bond)" + EthernetL3CheckerName = "L3(LACP)" + EthernetL4CheckerName = "L4(ARP)" + EthernetL5CheckerName = "L5(Routing)" +) + +var EthernetCheckItems = map[string]string{ + EthernetL1CheckerName: "Check Layer 1 properties", + EthernetL2CheckerName: "Check 
Layer 2 properties", + EthernetL3CheckerName: "Check Layer 3 properties", + EthernetL4CheckerName: "Check Layer 4 properties", + EthernetL5CheckerName: "Check Layer 5 properties", +} + +func LoadDefaultEventRules() (common.EventRuleGroup, error) { + eventRules := make(common.EventRuleGroup) + err := common.LoadDefaultEventRules(&eventRules, consts.ComponentNameEthernet) + return eventRules, err +} diff --git a/components/ethernet/config/default_spec.yaml b/components/ethernet/config/default_spec.yaml new file mode 100644 index 00000000..2b559801 --- /dev/null +++ b/components/ethernet/config/default_spec.yaml @@ -0,0 +1,13 @@ +ethernet: + default: + target_bond: "bond0" + bond_mode: "802.3ad" + mii_status: "up" + lacp_rate: "slow 0" + mtu: "1500" + speed: "25000" + min_slaves: 2 + xmit_hash_policy: "layer3+4 1" + miimon: 100 + updelay: 0 + downdelay: 0 diff --git a/components/ethernet/config/spec.go b/components/ethernet/config/spec.go new file mode 100644 index 00000000..bab74189 --- /dev/null +++ b/components/ethernet/config/spec.go @@ -0,0 +1,63 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package config + +import ( + "fmt" + "github.com/scitix/sichek/pkg/utils" + "github.com/sirupsen/logrus" +) + +type EthernetSpecConfig struct { + TargetBond string `json:"target_bond" yaml:"target_bond"` + BondMode string `json:"bond_mode" yaml:"bond_mode"` + MIIStatus string `json:"mii_status" yaml:"mii_status"` + LACPRate string `json:"lacp_rate" yaml:"lacp_rate"` + MTU string `json:"mtu" yaml:"mtu"` + Speed string `json:"speed" yaml:"speed"` + MinSlaves int `json:"min_slaves" yaml:"min_slaves"` + XmitHashPolicy string `json:"xmit_hash_policy" yaml:"xmit_hash_policy"` + Miimon int `json:"miimon" yaml:"miimon"` + UpDelay int `json:"updelay" yaml:"updelay"` + DownDelay int `json:"downdelay" yaml:"downdelay"` +} + +type EthernetSpecs struct { + Ethernet map[string]*EthernetSpecConfig `json:"ethernet" yaml:"ethernet"` +} + +// LoadSpec loads Ethernet spec from the given file path. +func LoadSpec(file string) (*EthernetSpecConfig, error) { + if file == "" { + return nil, fmt.Errorf("ethernet spec file path is empty") + } + s := &EthernetSpecs{} + if err := utils.LoadFromYaml(file, s); err != nil { + return nil, fmt.Errorf("failed to parse YAML file %s: %v", file, err) + } + + if s.Ethernet == nil { + return nil, fmt.Errorf("ethernet spec is empty") + } + + // For ethernet, we assume a "default" spec for now, similar to infiniband + if spec, ok := s.Ethernet["default"]; ok { + logrus.WithField("component", "ethernet").Infof("Loaded default Ethernet spec") + return spec, nil + } + + return nil, fmt.Errorf("default ethernet spec not found in provided specs") +} diff --git a/components/ethernet/ethernet.go b/components/ethernet/ethernet.go new file mode 100644 index 00000000..4e63051e --- /dev/null +++ b/components/ethernet/ethernet.go @@ -0,0 +1,343 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package ethernet + +import ( + "context" + "fmt" + "strings" + "sync" + "time" + + "github.com/scitix/sichek/components/common" + filter "github.com/scitix/sichek/components/common/eventfilter" + "github.com/scitix/sichek/components/ethernet/checker" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + ethmetrics "github.com/scitix/sichek/components/ethernet/metrics" + "github.com/scitix/sichek/consts" + "github.com/scitix/sichek/pkg/utils" + + "github.com/sirupsen/logrus" +) + +type component struct { + ctx context.Context + cancel context.CancelFunc + componentName string + cfg *config.EthernetUserConfig + cfgMutex sync.Mutex + collector *collector.EthernetCollector + checkers []common.Checker + filter *filter.EventFilter + metrics *ethmetrics.EthernetMetrics + + cacheMtx sync.RWMutex + cacheBuffer []*common.Result + cacheInfo []common.Info + currIndex int64 + cacheSize int64 + + service *common.CommonService +} + +var ( + ethernetComponent *component + ethernetComponentOnce sync.Once +) + +func NewEthernetComponent(cfgFile string, specFile string, ignoredCheckers []string) (common.Component, error) { + var err error + ethernetComponentOnce.Do(func() { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("panic occurred when create component ethernet: %v", r) + } + }() + ethernetComponent, err = newEthernetComponent(cfgFile, specFile, ignoredCheckers) + }) + return ethernetComponent, err +} + +func newEthernetComponent(cfgFile string, specFile string, ignoredCheckers []string) (comp 
*component, err error) { + ctx, cancel := context.WithCancel(context.Background()) + defer func() { + if err != nil { + cancel() + } + }() + + cfg := &config.EthernetUserConfig{} + err = common.LoadUserConfig(cfgFile, cfg) + if err != nil || cfg.Ethernet == nil { + logrus.WithField("component", "ethernet").Warnf("get user config failed or ethernet config is nil, using default config") + cfg.Ethernet = &config.EthernetConfig{ + QueryInterval: common.Duration{Duration: 60 * time.Second}, + CacheSize: 5, + } + } + if len(ignoredCheckers) > 0 { + cfg.Ethernet.IgnoredCheckers = ignoredCheckers + } + + eventRules, err := config.LoadDefaultEventRules() + if err != nil { + logrus.WithField("component", "ethernet").Warnf("failed to load eventrules: %v", err) + } + + filterPointer, err := filter.NewEventFilter(consts.ComponentNameEthernet, eventRules, 100) + if err != nil { + logrus.WithField("component", "ethernet").Warnf("NewEthernetComponent create event filter failed: %v", err) + filterPointer = nil + } + + spec, err := config.LoadSpec(specFile) + if err != nil { + logrus.WithField("component", "ethernet").Warnf("failed to load spec %s: %v", specFile, err) + } + + targetBond := "" + if spec != nil { + targetBond = spec.TargetBond + } + + collectorInst, err := collector.NewEthernetCollector(targetBond) + if err != nil { + logrus.WithField("component", "ethernet").Errorf("NewEthernetComponent create collector failed: %v", err) + return nil, err + } + + checkers, err := checker.NewCheckers(cfg, spec) + if err != nil { + return nil, err + } + + cacheSize := cfg.Ethernet.CacheSize + if cacheSize == 0 { + cacheSize = 5 + } + + component := &component{ + ctx: ctx, + cancel: cancel, + componentName: consts.ComponentNameEthernet, + collector: collectorInst, + checkers: checkers, + filter: filterPointer, + cfg: cfg, + cacheBuffer: make([]*common.Result, cacheSize), + cacheInfo: make([]common.Info, cacheSize), + cacheSize: cacheSize, + metrics: ethmetrics.NewEthernetMetrics(), + } 
+ service := common.NewCommonService(ctx, cfg, component.componentName, component.GetTimeout(), component.HealthCheck) + component.service = service + + return component, nil +} + +func (c *component) Name() string { + return c.componentName +} + +func (c *component) HealthCheck(ctx context.Context) (*common.Result, error) { + timer := common.NewTimer(fmt.Sprintf("%s-HealthCheck-Cost", c.componentName)) + ethInfo, err := c.collector.Collect(ctx) + if err != nil { + logrus.WithField("component", "ethernet").Errorf("failed to collect ethernet info: %v", err) + return nil, err + } + logrus.WithField("component", "ethernet").Infof("collected ethernet info: %+v", ethInfo) + + if c.cfg.Ethernet != nil && c.cfg.Ethernet.EnableMetrics { + c.metrics.ExportMetrics(ethInfo) + } + + result := common.Check(ctx, c.componentName, ethInfo, c.checkers) + timer.Mark("ethernet-check") + + if c.filter != nil { + eventResult := c.filter.Check() + timer.Mark("event-filter") + if eventResult != nil { + result.Checkers = append(result.Checkers, eventResult.Checkers...) 
+ if eventResult.Status == consts.StatusAbnormal { + result.Status = consts.StatusAbnormal + if consts.LevelPriority[result.Level] < consts.LevelPriority[eventResult.Level] { + result.Level = eventResult.Level + } + } + } + } + + c.cacheMtx.Lock() + c.cacheBuffer[c.currIndex] = result + c.cacheInfo[c.currIndex] = ethInfo + c.currIndex = (c.currIndex + 1) % c.cacheSize + c.cacheMtx.Unlock() + + if result.Status == consts.StatusAbnormal && consts.LevelPriority[result.Level] > consts.LevelPriority[consts.LevelInfo] { + logrus.WithField("component", "ethernet").Errorf("Health Check Failed") + } else { + logrus.WithField("component", "ethernet").Infof("Health Check PASSED") + } + + return result, nil +} + +func (c *component) CacheResults() ([]*common.Result, error) { + c.cacheMtx.RLock() + defer c.cacheMtx.RUnlock() + return c.cacheBuffer, nil +} + +func (c *component) LastResult() (*common.Result, error) { + c.cacheMtx.RLock() + defer c.cacheMtx.RUnlock() + result := c.cacheBuffer[c.currIndex] + if c.currIndex == 0 { + result = c.cacheBuffer[c.cacheSize-1] + } + return result, nil +} + +func (c *component) CacheInfos() ([]common.Info, error) { + c.cacheMtx.RLock() + defer c.cacheMtx.RUnlock() + return c.cacheInfo, nil +} + +func (c *component) LastInfo() (common.Info, error) { + c.cacheMtx.RLock() + defer c.cacheMtx.RUnlock() + var info common.Info + if c.currIndex == 0 { + info = c.cacheInfo[c.cacheSize-1] + } else { + info = c.cacheInfo[c.currIndex-1] + } + return info, nil +} + +func (c *component) Start() <-chan *common.Result { + return c.service.Start() +} + +func (c *component) Stop() error { + return c.service.Stop() +} + +func (c *component) Update(cfg common.ComponentUserConfig) error { + c.cfgMutex.Lock() + configPointer, ok := cfg.(*config.EthernetUserConfig) + if !ok { + return fmt.Errorf("update wrong config type for ethernet") + } + c.cfg = configPointer + c.cfgMutex.Unlock() + return c.service.Update(cfg) +} + +func (c *component) Status() bool { + 
return c.service.Status() +} + +func (c *component) GetTimeout() time.Duration { + return c.cfg.GetQueryInterval().Duration +} + +func (c *component) PrintInfo(info common.Info, result *common.Result, summaryPrint bool) bool { + checkAllPassed := true + if result.Status == consts.StatusAbnormal && consts.LevelPriority[result.Level] > consts.LevelPriority[consts.LevelInfo] { + checkAllPassed = false + } + ethEvent := make(map[string]string) + + l1Print := fmt.Sprintf("L1(Link): %sNot Checked%s", consts.Yellow, consts.Reset) + l2Print := fmt.Sprintf("L2(Bond): %sNot Checked%s", consts.Yellow, consts.Reset) + l3Print := fmt.Sprintf("L3(LACP): %sNot Checked%s", consts.Yellow, consts.Reset) + l4Print := fmt.Sprintf("L4(ARP) : %sNot Checked%s", consts.Yellow, consts.Reset) + l5Print := fmt.Sprintf("L5(Route): %sNot Checked%s", consts.Yellow, consts.Reset) + + utils.PrintTitle("Ethernet", "-") + checkerResults := result.Checkers + for _, res := range checkerResults { + if res.Status != consts.StatusNormal && res.Level != consts.LevelInfo { + checkAllPassed = false + ethEvent[res.Name] = fmt.Sprintf("Event: %s%s%s -> %s", consts.Red, res.ErrorName, consts.Reset, strings.TrimRight(res.Detail, "\n")) + } + + statusColor := consts.Green + statusText := "OK" + if res.Status != consts.StatusNormal { + statusColor = consts.Red + statusText = "Err" + } + + switch res.Name { + case config.EthernetL1CheckerName: + l1Print = fmt.Sprintf("L1(Physical Link): %s%s%s", statusColor, statusText, consts.Reset) + case config.EthernetL2CheckerName: + l2Print = fmt.Sprintf("L2(Bonding) : %s%s%s", statusColor, statusText, consts.Reset) + case config.EthernetL3CheckerName: + l3Print = fmt.Sprintf("L3(LACP) : %s%s%s", statusColor, statusText, consts.Reset) + case config.EthernetL4CheckerName: + l4Print = fmt.Sprintf("L4(ARP) : %s%s%s", statusColor, statusText, consts.Reset) + case config.EthernetL5CheckerName: + l5Print = fmt.Sprintf("L5(Routing) : %s%s%s", statusColor, statusText, consts.Reset) 
+ } + } + + ethInfo, ok := info.(*collector.EthernetInfo) + if ok && len(ethInfo.BondInterfaces) > 0 { + for _, bond := range ethInfo.BondInterfaces { + fmt.Printf("Bond Interface: %s\n", bond) + if sysfs, exists := ethInfo.SysfsBonding[bond]; exists { + mode := sysfs["mode"] + miimon := sysfs["miimon"] + lacpRate := sysfs["lacp_rate"] + slaves := strings.Join(ethInfo.BondSlaves[bond], ", ") + + fmt.Printf("Bond Mode: %-25s ", mode) + fmt.Printf("MII Monitor: %-25s\n", miimon) + fmt.Printf("LACP Rate: %-25s ", lacpRate) + fmt.Printf("Slaves : %-25s\n", slaves) + } + + // Try parsing sysctl rp_filter + if rpFilter, exists := ethInfo.RPFilter[bond]; exists { + fmt.Printf("RP Filter: %-25s\n", rpFilter) + } + fmt.Println() + } + } + + fmt.Printf("%-35s%-35s\n", l1Print, l2Print) + fmt.Printf("%-35s%-35s\n", l3Print, l4Print) + fmt.Printf("%-35s\n", l5Print) + + if len(ethEvent) == 0 { + fmt.Printf("\nErrors Events:\n\tNo Ethernet Events Detected\n") + } else { + fmt.Printf("\nErrors Events:\n") + for _, v := range ethEvent { + fmt.Printf("\t%s\n", v) + } + } + + fmt.Println() + return checkAllPassed +} diff --git a/components/ethernet/ethernet_test.go b/components/ethernet/ethernet_test.go new file mode 100644 index 00000000..a2f19505 --- /dev/null +++ b/components/ethernet/ethernet_test.go @@ -0,0 +1,174 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package ethernet + +import ( + "context" + "testing" + + "github.com/scitix/sichek/components/ethernet/checker" + "github.com/scitix/sichek/components/ethernet/collector" + "github.com/scitix/sichek/components/ethernet/config" + "github.com/scitix/sichek/consts" + "github.com/stretchr/testify/assert" +) + +func TestEthernetCheckers(t *testing.T) { + spec := &config.EthernetSpecConfig{ + BondMode: "802.3ad", + MIIStatus: "up", + LACPRate: "fast", + MTU: "1500", + MinSlaves: 1, + XmitHashPolicy: "layer3+4", + Miimon: 100, + UpDelay: 100, + DownDelay: 100, + } + + info := &collector.EthernetInfo{ + BondInterfaces: []string{"bond0"}, + Bonds: map[string]collector.BondState{ + "bond0": { + Name: "bond0", + IsUp: true, + HasLowerUp: true, + IPAddr: "192.168.1.1", + MTU: 1500, + Mode: "802.3ad", + Miimon: 100, + XmitHashPolicy: "layer3+4", + LACPRate: "fast", + ActiveSlave: "eth0", + }, + }, + LACP: map[string]collector.LACPState{ + "bond0": { + ActiveAggregatorID: "1", + ActorKey: "21", + PartnerKey: "40016", + PartnerMacAddress: "11:22:33:44:55:66", + SlaveAggregatorIDs: map[string]string{"eth0": "1"}, + SlaveActorKeys: map[string]string{"eth0": "21"}, + SlavePartnerKeys: map[string]string{"eth0": "40016"}, + }, + }, + Slaves: map[string]map[string]collector.SlaveState{ + "bond0": { + "eth0": {Name: "eth0", IsUp: true, LinkDetected: true, Speed: 25000, Duplex: "Full"}, + }, + }, + Stats: map[string]collector.TrafficStats{ + "eth0": {RXErrors: 0, TXErrors: 0, Dropped: 0, Carrier: 0}, + }, + Routes: collector.RouteState{ + DefaultRouteViaBond: true, + GatewayReachable: true, + GatewayIP: "192.168.1.1", + }, + SyslogErrors: []string{}, + ProcNetBonding: map[string]string{ + "bond0": "MII Status: up\nUp Delay (ms): 100\nDown Delay (ms): 100\n", + }, + RPFilter: map[string]string{ + "all": "0", + "bond0": "0", + }, + } + + ctx := context.Background() + + checkers, err := checker.NewCheckers(&config.EthernetUserConfig{}, spec) + assert.NoError(t, err) + + for _, c := 
range checkers { + res, err := c.Check(ctx, info) + assert.NoError(t, err) + assert.Equal(t, consts.StatusNormal, res.Status, "Checker %s failed unexpectedly: %s", c.Name(), res.Detail) + } +} + +func TestEthernetCheckersFailures(t *testing.T) { + spec := &config.EthernetSpecConfig{ + BondMode: "802.3ad", + MIIStatus: "up", + LACPRate: "fast", + MTU: "1500", + MinSlaves: 2, + XmitHashPolicy: "layer3+4", + Miimon: 100, + UpDelay: 100, + DownDelay: 100, + } + + info := &collector.EthernetInfo{ + BondInterfaces: []string{"bond0"}, + Bonds: map[string]collector.BondState{ + "bond0": { + Name: "bond0", + IsUp: false, + HasLowerUp: false, + IPAddr: "", + MTU: 1500, + Mode: "802.3ad", + Miimon: 0, + XmitHashPolicy: "layer3+4", + LACPRate: "slow", + ActiveSlave: "", + }, + }, + LACP: map[string]collector.LACPState{ + "bond0": { + ActiveAggregatorID: "1", + ActorKey: "21", + PartnerKey: "40016", + PartnerMacAddress: "00:00:00:00:00:00", + SlaveAggregatorIDs: map[string]string{"eth0": "2"}, // mismatch + SlaveActorKeys: map[string]string{"eth0": "21"}, + SlavePartnerKeys: map[string]string{"eth0": "40016"}, + }, + }, + Slaves: map[string]map[string]collector.SlaveState{ + "bond0": { + "eth0": {Name: "eth0", IsUp: false, LinkDetected: false, Speed: 1000, Duplex: "Half"}, + }, + }, + Stats: map[string]collector.TrafficStats{ + "eth0": {RXErrors: 100, TXErrors: 50, Dropped: 100, Carrier: 10}, + }, + Routes: collector.RouteState{ + DefaultRouteViaBond: false, + GatewayReachable: false, + GatewayIP: "192.168.1.1", + }, + SyslogErrors: []string{"eth0 tx timeout"}, + RPFilter: map[string]string{ + "all": "1", + "bond0": "1", + }, + } + + ctx := context.Background() + + checkers, err := checker.NewCheckers(&config.EthernetUserConfig{}, spec) + assert.NoError(t, err) + + for _, c := range checkers { + res, err := c.Check(ctx, info) + assert.NoError(t, err) + assert.Equal(t, consts.StatusAbnormal, res.Status, "Checker %s should have failed", c.Name()) + } +} diff --git 
a/components/ethernet/metrics/metrics.go b/components/ethernet/metrics/metrics.go new file mode 100644 index 00000000..2111307f --- /dev/null +++ b/components/ethernet/metrics/metrics.go @@ -0,0 +1,83 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package metrics + +import ( + "github.com/scitix/sichek/components/ethernet/collector" + common "github.com/scitix/sichek/metrics" +) + +const ( + MetricPrefix = "sichek_ethernet" + TagPrefix = "json" +) + +type EthernetMetrics struct { + BondStatusGauge *common.GaugeVecMetricExporter + SlaveStatusGauge *common.GaugeVecMetricExporter + RouteStatusGauge *common.GaugeVecMetricExporter + TrafficStatsGauge *common.GaugeVecMetricExporter + LACPStatusGauge *common.GaugeVecMetricExporter + SystemStatusGauge *common.GaugeVecMetricExporter +} + +func NewEthernetMetrics() *EthernetMetrics { + // Use distinct prefixes to avoid metric name collision. + // We stick to ExportStruct for consistency in label cardinality. 
+ return &EthernetMetrics{ + BondStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_bond", []string{"bond"}), + SlaveStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_slave", []string{"bond", "slave"}), + RouteStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_route", nil), + TrafficStatsGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_traffic", []string{"interface"}), + LACPStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_lacp", []string{"bond"}), + SystemStatusGauge: common.NewGaugeVecMetricExporter(MetricPrefix+"_system", nil), + } +} + +func (m *EthernetMetrics) ExportMetrics(info *collector.EthernetInfo) { + if info == nil { + return + } + + // 1. Export Bond Status + for bondName, bondState := range info.Bonds { + m.BondStatusGauge.ExportStruct(bondState, []string{bondName}, TagPrefix) + } + + // 2. Export Slave Status + for bondName, slaves := range info.Slaves { + for slaveName, slaveState := range slaves { + m.SlaveStatusGauge.ExportStruct(slaveState, []string{bondName, slaveName}, TagPrefix) + } + } + + // 3. Export Route Status + m.RouteStatusGauge.ExportStruct(info.Routes, []string{}, TagPrefix) + + // 4. Export Traffic Stats + for ifaceName, stats := range info.Stats { + m.TrafficStatsGauge.ExportStruct(stats, []string{ifaceName}, TagPrefix) + } + + // 5. Export LACP Info + for bondName, lacpState := range info.LACP { + m.LACPStatusGauge.ExportStruct(lacpState, []string{bondName}, TagPrefix) + } + + // 6. 
Export System Info + m.SystemStatusGauge.SetMetric("syslog_error_count", nil, float64(len(info.SyslogErrors))) + m.SystemStatusGauge.SetMetric("bond_count", nil, float64(len(info.BondInterfaces))) +} diff --git a/config/default_spec.yaml b/config/default_spec.yaml index da63ee78..53aea44c 100644 --- a/config/default_spec.yaml +++ b/config/default_spec.yaml @@ -101,4 +101,17 @@ pcie_topo: pci_switches: - gpu: 1 ib: 1 - count: 8 \ No newline at end of file + count: 8 +ethernet: + default: + target_bond: "bond0" + bond_mode: "802.3ad" + mii_status: "up" + lacp_rate: "slow 0" + mtu: "1500" + speed: "25000" + min_slaves: 2 + xmit_hash_policy: "layer3+4 1" + miimon: 100 + updelay: 0 + downdelay: 0 \ No newline at end of file diff --git a/config/default_user_config.yaml b/config/default_user_config.yaml index 07605f8d..6531cbcf 100644 --- a/config/default_user_config.yaml +++ b/config/default_user_config.yaml @@ -65,4 +65,10 @@ nccltest: enable: true pcie_topo: - enable: true \ No newline at end of file + enable: true + +ethernet: + query_interval: 10s + cache_size: 5 + ignored_checkers: [] + enable_metrics: true \ No newline at end of file diff --git a/consts/consts.go b/consts/consts.go index d2da83f9..d9bd9729 100644 --- a/consts/consts.go +++ b/consts/consts.go @@ -81,7 +81,7 @@ var ( DefaultComponentQueryInterval = time.Duration.Seconds(1) DefaultComponents = []string{ - ComponentNameCPU, ComponentNameNvidia, ComponentNameInfiniband, ComponentNameGpfs, ComponentNameDmesg, + ComponentNameCPU, ComponentNameNvidia, ComponentNameInfiniband, ComponentNameEthernet, ComponentNameGpfs, ComponentNameDmesg, ComponentNamePodlog, ComponentNameGpuEvents, ComponentNameSyslog, } ) diff --git a/docs/ethernet.md b/docs/ethernet.md new file mode 100644 index 00000000..36bdb612 --- /dev/null +++ b/docs/ethernet.md @@ -0,0 +1,70 @@ +# Ethernet Network Check + +Ethernet network performance and operational status can be assessed through comprehensive system metrics. 
Proactively catching and localizing failures before they affect user workloads is crucial for maintaining cluster stability and high utilization, especially in GPU compute scenarios. + +## Ethernet Specific Checks + +To ensure the proper functioning of Ethernet interfaces (especially Bond aggregation links), SiChek performs evaluations across the following layers: + +### 1. Configuration Validation + +- **Bonding Settings**: + - Verify that the **Bond Mode** matches the expectation (e.g., 802.3ad Dynamic link aggregation). + - Confirm that **MII Status Monitoring** (Miimon) is enabled and the polling interval is reasonably configured. + - Validate **MTU** configuration consistency to prevent performance loss due to fragmentation in large packet transfers or RoCE scenarios. + - Verify that the network **Transmission Hash Policy** (xmit_hash_policy) meets business requirements (e.g., layer3+4). + +- **LACP Negotiation Baselines**: + - In 802.3ad mode, validate the **LACP Rate** (lacp_rate) configuration (fast/slow). + - Check if the **Minimum Slave Count** (MinSlaves) in multi-slave environments meets the standard. + +### 2. Runtime Error and Anomaly Detection + +- **Physical Layer (L1)**: + - **Link Detected**: Confirm that all underlying physical slave NICs have a link status of UP. + - **Physical Speed Match**: Check if the actual negotiated speed matches hardware specifications (e.g., 25Gbps/100Gbps). + - **TX Timeout**: Scan kernel logs to identify transmit timeout warnings generated by drivers or firmware. + +- **Bonding Layer (L2)**: + - **Aggregator Consistency**: Check if the aggregator IDs of each slave NIC in LACP negotiation match the global active aggregator ID. + - **Partner MAC**: Identify if the switch-side LACP is responding normally (rejecting all-zero MACs). + - **Link Flapping and Failures**: Monitor the `Link Failure Count` history and increments for slave NICs. 
+ +- **Connectivity and Forwarding Layer (L3)**: + - **ARP Neighbor Status**: Detect abnormal `FAILED` or `INCOMPLETE` entries in the ARP (neighbor) table. + - **Gateway Reachability**: Verify default gateway connectivity through both ping and the neighbor cache. + - **Reverse Path Filter (RP Filter)**: Validate configuration to prevent legitimate packets from being dropped by the kernel due to asymmetric routing. + +## Key Metrics + +SiChek collects and parses the following key metrics to support the checks listed above: + +- **NIC Basic Info** (device-level) + - **Link Detected**: Physical carrier detection status. + - **Negotiated Speed**: Auto-negotiated or manually configured physical-layer link speed. + - **Duplex Mode**: Negotiated duplex setting (Full/Half). + +- **Bonding States** (interface-level) + - **Bonding Mode**: System-defined link aggregation mode. + - **MII Status**: Logical Bond interface connectivity status. + - **MTU**: Maximum Transmission Unit size. + - **Xmit Hash Policy**: Hash algorithm for distributing packets across different slave NICs. + - **Active Aggregator ID**: Active aggregator identifier in the current 802.3ad mode. + - **Partner MAC Address**: MAC address of the successfully negotiated peer switch. + +- **Traffic Statistics** (device-level) + - **RX/TX Errors**: Count of packet errors during receiving or sending. + - **Dropped Packets**: Count of packets dropped due to insufficient buffer or configuration limits. + - **Carrier Errors**: Count of carrier signal losses detected at the link layer. + - **CRC Errors (Approximation)**: Cyclic Redundancy Check errors, indicating physical medium quality. + +- **Protocol and Routing States** (node-level) + - **IP Neighbor State**: L2 address-resolution status of neighbor nodes as returned by ARP commands. + - **Gateway Reachability**: Indicates if the default gateway responds to ICMP or L2 liveness probes.
+ - **Default Route Via Bond**: Confirms whether the system's primary default route is carried on the target Bond interface. + - **rp_filter Context**: Values of `net.ipv4.conf.all.rp_filter` (system-wide) and `net.ipv4.conf.<interface>.rp_filter` (per-interface, e.g. `bond0`). + +- **System Log Anomalies** (node-level) + - **Kernel Dmesg/Journal**: Real-time capture of network-related kernel error records containing "flap", "down", "fail", or "tx timeout". + +By systematically collecting these multi-dimensional metrics and performing layered checks, SiChek can accurately identify Ethernet configuration flaws and operational risks, ensuring network reliability in large-scale computing clusters.