Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/command/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ func NewRootCmd() *cobra.Command {
"h": true,
"all": true,
"run": true,
"ethernet": true,
"e": true,
}

if commandsRequireRoot[cmd.Use] {
Expand All @@ -57,7 +59,7 @@ func NewRootCmd() *cobra.Command {
rootCmd.AddCommand(component.NewCPUCmd())
rootCmd.AddCommand(component.NewNvidiaCmd())
rootCmd.AddCommand(component.NewInfinibandCmd())
// rootCmd.AddCommand(component.NewEthernetCmd())
rootCmd.AddCommand(component.NewEthernetCmd())
rootCmd.AddCommand(component.NewGpfsCmd())
rootCmd.AddCommand(component.NewPodLogCmd())
rootCmd.AddCommand(component.NewDmesgCmd())
Expand Down
3 changes: 3 additions & 0 deletions cmd/command/component/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/scitix/sichek/components/common"
"github.com/scitix/sichek/components/cpu"
"github.com/scitix/sichek/components/dmesg"
"github.com/scitix/sichek/components/ethernet"
"github.com/scitix/sichek/components/gpfs"
gpuevents "github.com/scitix/sichek/components/gpuevents"
"github.com/scitix/sichek/components/infiniband"
Expand Down Expand Up @@ -186,6 +187,8 @@ func NewComponent(componentName string, cfgFile string, specFile string, ignored
case consts.ComponentNameSyslog:
// if skipPercent is -1, use the value from the config file
return syslog.NewComponent(cfgFile, "", -1)
case consts.ComponentNameEthernet:
return ethernet.NewEthernetComponent(cfgFile, specFile, ignoredCheckers)
default:
return nil, fmt.Errorf("invalid component name: %s", componentName)
}
Expand Down
93 changes: 93 additions & 0 deletions cmd/command/component/ethernet.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
Copyright 2024 The Scitix Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package component

import (
"context"
"strings"

"github.com/scitix/sichek/cmd/command/spec"
"github.com/scitix/sichek/components/ethernet"
"github.com/scitix/sichek/consts"

"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)

// NewEthernetCmd builds the "ethernet" subcommand (alias "e").
//
// When run, the command resolves the config and spec files through the spec
// package, creates the Ethernet component, runs a single check bounded by
// consts.CmdTimeout and prints the result. Resolution errors are logged but do
// not abort the run; a component construction or check error ends the run
// without printing anything.
func NewEthernetCmd() *cobra.Command {
	var (
		cfgFile            string
		specFile           string
		ignoredCheckersStr string
		verbose            bool
	)

	run := func(cmd *cobra.Command, args []string) {
		ctx, cancel := context.WithTimeout(context.Background(), consts.CmdTimeout)
		// Always release the timeout context; in verbose mode announce it first.
		defer func() {
			if verbose {
				logrus.WithField("component", "ethernet").Info("Run ethernet Cmd context canceled")
			}
			cancel()
		}()

		if verbose {
			logrus.SetLevel(logrus.DebugLevel)
		} else {
			logrus.SetLevel(logrus.ErrorLevel)
		}

		resolvedCfgFile, cfgErr := spec.EnsureCfgFile(cfgFile)
		if cfgErr == nil {
			logrus.WithField("daemon", "ethernet").Info("load cfgFile: " + resolvedCfgFile)
		} else {
			logrus.WithField("daemon", "ethernet").Errorf("failed to load cfgFile: %v", cfgErr)
		}

		resolvedSpecFile, specErr := spec.EnsureSpecFile(specFile)
		if specErr == nil {
			logrus.WithField("daemon", "ethernet").Info("load specFile: " + resolvedSpecFile)
		} else {
			logrus.WithField("daemon", "ethernet").Errorf("failed to load specFile: %v", specErr)
		}

		// An empty flag value means "ignore nothing" and leaves the slice nil.
		var ignoredCheckers []string
		if ignoredCheckersStr != "" {
			ignoredCheckers = strings.Split(ignoredCheckersStr, ",")
		}

		ethComponent, err := ethernet.NewEthernetComponent(resolvedCfgFile, resolvedSpecFile, ignoredCheckers)
		if err != nil {
			logrus.WithField("component", "ethernet").Error(err)
			return
		}
		logrus.WithField("component", "ethernet").Infof("Run Ethernet component check: %s", ethComponent.Name())

		result, err := RunComponentCheck(ctx, ethComponent, consts.CmdTimeout)
		if err != nil {
			return
		}
		PrintCheckResults(true, result)
	}

	ethernetCmd := &cobra.Command{
		Use:     "ethernet",
		Aliases: []string{"e"},
		Short:   "Perform Ethernet HealthCheck",
		Run:     run,
	}

	flags := ethernetCmd.Flags()
	flags.StringVarP(&cfgFile, "cfg", "c", "", "Path to the user config file")
	flags.StringVarP(&specFile, "spec", "s", "", "Path to the Ethernet specification file")
	flags.StringVarP(&ignoredCheckersStr, "ignored-checkers", "i", "", "Ignored checkers")
	flags.BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output")

	return ethernetCmd
}
69 changes: 69 additions & 0 deletions components/ethernet/checker/checker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
Copyright 2024 The Scitix Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checker

import (
"regexp"
"strconv"

"github.com/scitix/sichek/components/common"
"github.com/scitix/sichek/components/ethernet/config"
)

// NewCheckers returns the Ethernet checkers (L1 through L5) that are not
// listed in cfg.Ethernet.IgnoredCheckers.
//
// cfg may be nil, and so may cfg.Ethernet; in either case nothing is ignored.
// spec is handed to every checker unchanged. The returned slice is nil when
// every checker is ignored. The error result is always nil.
func NewCheckers(cfg *config.EthernetUserConfig, spec *config.EthernetSpecConfig) ([]common.Checker, error) {
	// Names the user configuration asks to skip.
	skip := make(map[string]bool)
	if cfg != nil && cfg.Ethernet != nil {
		for _, name := range cfg.Ethernet.IgnoredCheckers {
			skip[name] = true
		}
	}

	// L1 and L2 carry per-NIC history between runs, so their maps are
	// allocated up front.
	l1 := &L1Checker{
		spec:        spec,
		prevCRC:     make(map[string]int64),
		prevCarrier: make(map[string]int64),
		prevDrops:   make(map[string]int64),
	}
	l2 := &L2Checker{
		spec:             spec,
		prevLinkFailures: make(map[string]int64),
		prevActiveSlave:  make(map[string]string),
	}
	candidates := []common.Checker{
		l1,
		l2,
		&L3Checker{spec: spec},
		&L4Checker{spec: spec},
		&L5Checker{spec: spec},
	}

	var active []common.Checker
	for _, candidate := range candidates {
		if skip[candidate.Name()] {
			continue
		}
		active = append(active, candidate)
	}
	return active, nil
}


// extractInt returns the base-10 integer captured by the first capture group
// of pattern in input.
//
// It returns 0 when pattern is not a valid regular expression, when pattern
// does not match input, when pattern has no capture group, or when the
// captured text is not a valid integer. A captured number that overflows
// int64 is returned clamped to the nearest int64 bound.
func extractInt(input, pattern string) int64 {
	// Compile rather than MustCompile: pattern is supplied by the caller, and a
	// malformed pattern should produce "no value" instead of a panic.
	re, err := regexp.Compile(pattern)
	if err != nil {
		return 0
	}

	matches := re.FindStringSubmatch(input)
	if len(matches) < 2 {
		return 0
	}

	// The error is deliberately ignored: on a syntax error ParseInt already
	// returns 0, and on a range error it returns the clamped int64 bound,
	// which is the best available approximation.
	val, _ := strconv.ParseInt(matches[1], 10, 64)
	return val
}
128 changes: 128 additions & 0 deletions components/ethernet/checker/l1_checker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
Copyright 2024 The Scitix Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checker

import (
"context"
"fmt"
"strconv"
"strings"

"github.com/scitix/sichek/components/common"
"github.com/scitix/sichek/components/ethernet/collector"
"github.com/scitix/sichek/components/ethernet/config"
"github.com/scitix/sichek/consts"
)

// L1Checker checks the state of bonded slave NICs: link detection, tx timeouts
// in the collected syslog lines, link speed, and growth of RX error, carrier
// and drop counters between successive Check calls.
type L1Checker struct {
// spec is the Ethernet spec; when it is non-nil and spec.Speed is non-empty,
// Check uses spec.Speed as the expected NIC speed.
spec *config.EthernetSpecConfig
// prevCRC maps NIC name to the RX error count seen on the previous Check call.
prevCRC map[string]int64
// prevCarrier maps NIC name to the carrier error count seen on the previous Check call.
prevCarrier map[string]int64
// prevDrops maps NIC name to the dropped-packet count seen on the previous Check call.
prevDrops map[string]int64
}

// Name returns config.EthernetL1CheckerName, the identifier of this checker.
func (c *L1Checker) Name() string {
	return config.EthernetL1CheckerName
}

// Check inspects every slave NIC of every bond interface in data and reports
// physical-layer problems.
//
// data must be a non-nil *collector.EthernetInfo; anything else yields an
// "invalid data type" error. For each slave the following findings are
// possible:
//   - LinkDown (critical): no link detected.
//   - TxTimeout (critical): a collected syslog line contains both "tx timeout"
//     and the NIC name.
//   - SpeedMismatch (warning): the reported speed is positive and differs from
//     the expected speed (spec.Speed when set, otherwise "25000").
//   - CRCErrorsGrowing, CarrierErrorsGrowing, DropsGrowing (warning): the
//     corresponding counter is higher than on the previous call.
//
// Every finding appends one line to Detail. Severity only escalates: once a
// critical finding has been recorded, later warnings still add their detail
// line but no longer overwrite Level or ErrorName. Among findings of the same
// severity the most recent one determines ErrorName.
//
// Counter values are remembered per NIC between calls, so the first call that
// sees a NIC never reports counter growth for it.
func (c *L1Checker) Check(ctx context.Context, data any) (*common.CheckerResult, error) {
	info, ok := data.(*collector.EthernetInfo)
	if !ok || info == nil {
		return nil, fmt.Errorf("invalid data type")
	}

	// Tolerate a checker built without its history maps: writing to a nil map
	// would panic.
	if c.prevCRC == nil {
		c.prevCRC = make(map[string]int64)
	}
	if c.prevCarrier == nil {
		c.prevCarrier = make(map[string]int64)
	}
	if c.prevDrops == nil {
		c.prevDrops = make(map[string]int64)
	}

	result := &common.CheckerResult{
		Name:        c.Name(),
		Description: config.EthernetCheckItems[c.Name()],
		Status:      consts.StatusNormal,
		Level:       consts.LevelInfo,
		Curr:        "OK",
	}

	expectedSpeed := "25000" // default to 25G
	if c.spec != nil && c.spec.Speed != "" {
		expectedSpeed = c.spec.Speed
	}

	var detail strings.Builder
	critical := false

	// markCritical records a critical finding; the caller sets ErrorName.
	markCritical := func() {
		critical = true
		result.Status = consts.StatusAbnormal
		result.Level = consts.LevelCritical
	}
	// markWarning records a warning finding. It reports whether the caller may
	// set ErrorName, which is false once a critical finding has been recorded
	// so that a warning never downgrades the result.
	markWarning := func() bool {
		result.Status = consts.StatusAbnormal
		if critical {
			return false
		}
		result.Level = consts.LevelWarning
		return true
	}
	// grew stores curr as the latest value for nic and returns the previous
	// value together with whether a previous value existed and curr exceeds it.
	grew := func(history map[string]int64, nic string, curr int64) (int64, bool) {
		prev, seen := history[nic]
		history[nic] = curr
		return prev, seen && curr > prev
	}

	for _, bond := range info.BondInterfaces {
		for slaveName, slaveState := range info.Slaves[bond] {
			if !slaveState.LinkDetected {
				markCritical()
				result.ErrorName = "LinkDown"
				fmt.Fprintf(&detail, "Physical NIC %s link not UP. Command: ethtool %s, Expected: Link detected: yes, Actual: not connected or unknown.\n", slaveName, slaveName)
			}

			// One matching syslog line per NIC is enough.
			for _, errLine := range info.SyslogErrors {
				if strings.Contains(errLine, "tx timeout") && strings.Contains(errLine, slaveName) {
					markCritical()
					result.ErrorName = "TxTimeout"
					fmt.Fprintf(&detail, "NIC %s tx timeout found in kernel log. Command: dmesg | grep -iE 'eth|mlx|link'.\n", slaveName)
					break
				}
			}

			// Speed is only compared when the NIC reports a positive value.
			if speedStr := strconv.Itoa(slaveState.Speed); slaveState.Speed > 0 && speedStr != expectedSpeed {
				if markWarning() {
					result.ErrorName = "SpeedMismatch"
				}
				fmt.Fprintf(&detail, "NIC %s speed mismatch. Command: ethtool %s, Expected: %sMb/s, Actual: %sMb/s.\n", slaveName, slaveName, expectedSpeed, speedStr)
			}

			sStats := info.Stats[slaveName]

			// CRC errors are approximated by the general RX error counter; exact
			// CRC counts would need ethtool parsing.
			currCRC := sStats.RXErrors
			if prev, up := grew(c.prevCRC, slaveName, currCRC); up {
				if markWarning() {
					result.ErrorName = "CRCErrorsGrowing"
				}
				fmt.Fprintf(&detail, "NIC %s RX (CRC) errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCRC)
			}

			// Carrier errors
			currCarrier := sStats.Carrier
			if prev, up := grew(c.prevCarrier, slaveName, currCarrier); up {
				if markWarning() {
					result.ErrorName = "CarrierErrorsGrowing"
				}
				fmt.Fprintf(&detail, "NIC %s Carrier errors increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currCarrier)
			}

			// Drops
			currDrops := sStats.Dropped
			if prev, up := grew(c.prevDrops, slaveName, currDrops); up {
				if markWarning() {
					result.ErrorName = "DropsGrowing"
				}
				fmt.Fprintf(&detail, "NIC %s Drops increasing. Command: ip -s link show %s, Previous: %d, Current: %d.\n", slaveName, slaveName, prev, currDrops)
			}
		}
	}

	result.Detail = detail.String()
	if result.Status != consts.StatusNormal {
		result.Suggestion = "Please check physical link, cable, driver version (ethtool -i), or check dmesg for specific errors; if speed mismatch, check corresponding configuration."
	}

	return result, nil
}
Loading