From 87f576a93cd04f0e2e8a9a4c49ef67cd45e2a08c Mon Sep 17 00:00:00 2001 From: Anthony Harivel Date: Wed, 14 Jan 2026 14:55:36 +0100 Subject: [PATCH 1/3] collectors: add sriov metrics collector Add a new collector that exports SR-IOV metrics for physical functions and virtual functions. Supports VFs bound to both network drivers and vfio-pci by reading stats from the parent PF when direct VF stats are unavailable. Exported metrics include traffic counters (rx/tx bytes, unicast, multicast, broadcast), error counters (dropped, allocation failures), and TX performance metrics. Each metric includes NUMA node information for topology-aware monitoring. Parses per-VF statistics from Intel PF drivers (ixgbe, i40e, ice) which use different naming conventions for VF stats. Signed-off-by: Anthony Harivel --- collectors/collectors.go | 2 + collectors/sriov/collector.go | 382 ++++++++++++++++++++++++++++++++++ collectors/sriov/metrics.go | 127 +++++++++++ 3 files changed, 511 insertions(+) create mode 100644 collectors/sriov/collector.go create mode 100644 collectors/sriov/metrics.go diff --git a/collectors/collectors.go b/collectors/collectors.go index ea3b911..c68ea4e 100644 --- a/collectors/collectors.go +++ b/collectors/collectors.go @@ -15,6 +15,7 @@ import ( "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/ovsdbserver" "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/pmd_perf" "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/pmd_rxq" + "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/sriov" "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/vswitch" ) @@ -30,6 +31,7 @@ var collectors = []lib.Collector{ new(ovsdbserver.Collector), new(pmd_perf.Collector), new(pmd_rxq.Collector), + new(sriov.Collector), new(vswitch.Collector), } diff --git a/collectors/sriov/collector.go b/collectors/sriov/collector.go new file mode 100644 index 0000000..4fe6a3d --- 
/dev/null +++ b/collectors/sriov/collector.go @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: Apache-2.0 + +package sriov + +import ( + "bufio" + "context" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" + + "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/lib" + "github.com/openstack-k8s-operators/openstack-network-exporter/config" + "github.com/openstack-k8s-operators/openstack-network-exporter/log" + "github.com/prometheus/client_golang/prometheus" +) + +type Collector struct{} + +func (Collector) Name() string { + return "sriov" +} + +func (Collector) Metrics() []lib.Metric { + var res []lib.Metric + for _, m := range metrics { + res = append(res, m) + } + return res +} + +func (c *Collector) Describe(ch chan<- *prometheus.Desc) { + lib.DescribeEnabledMetrics(c, ch) +} + +type InterfaceInfo struct { + Name string + IsPF bool + IsVF bool + ParentPF string + VFNum int + NumVFs int + Driver string + PCIAddr string + NumaNode string +} + +func discoverSriovInterfaces() ([]InterfaceInfo, error) { + var interfaces []InterfaceInfo + + netPath := "/sys/class/net" + entries, err := os.ReadDir(netPath) + if err != nil { + return nil, err + } + + for _, entry := range entries { + name := entry.Name() + + if name == "lo" { + continue + } + + info := InterfaceInfo{Name: name, VFNum: -1} + + devicePath := filepath.Join(netPath, name, "device") + + if pciAddr, err := filepath.EvalSymlinks(devicePath); err == nil { + info.PCIAddr = filepath.Base(pciAddr) + } + + info.Driver = getDriver(devicePath) + info.NumaNode = getNumaNode(devicePath) + + numVFsPath := filepath.Join(devicePath, "sriov_numvfs") + if numVFs, err := readIntFromFile(numVFsPath); err == nil { + info.IsPF = true + info.NumVFs = numVFs + interfaces = append(interfaces, info) + continue + } + + physfnPath := filepath.Join(devicePath, "physfn") + if _, err := os.Lstat(physfnPath); err == nil { + info.IsVF = true + + if vfNum, pfPCI := getVFNumber(devicePath); 
vfNum >= 0 { + info.VFNum = vfNum + info.ParentPF = pfPCI + } + + interfaces = append(interfaces, info) + } + } + + return interfaces, nil +} + +func getDriver(devicePath string) string { + driverPath := filepath.Join(devicePath, "driver") + if target, err := os.Readlink(driverPath); err == nil { + return filepath.Base(target) + } + return "none" +} + +func getNumaNode(devicePath string) string { + numaPath := filepath.Join(devicePath, "numa_node") + data, err := os.ReadFile(numaPath) + if err != nil { + return "-1" + } + numaNode := strings.TrimSpace(string(data)) + if numaNode == "" { + return "-1" + } + return numaNode +} + +func readIntFromFile(path string) (int, error) { + data, err := os.ReadFile(path) + if err != nil { + return 0, err + } + value, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil { + return 0, err + } + return value, nil +} + +func getVFNumber(devicePath string) (int, string) { + physfnPath := filepath.Join(devicePath, "physfn") + pfDevice, err := os.Readlink(physfnPath) + if err != nil { + return -1, "" + } + + pfDevicePath := filepath.Join(devicePath, pfDevice) + entries, err := os.ReadDir(pfDevicePath) + if err != nil { + return -1, "" + } + + myDevice, err := filepath.EvalSymlinks(devicePath) + if err != nil { + return -1, "" + } + + virtfnRe := regexp.MustCompile(`^virtfn(\d+)$`) + for _, entry := range entries { + match := virtfnRe.FindStringSubmatch(entry.Name()) + if match == nil { + continue + } + + virtfnPath := filepath.Join(pfDevicePath, entry.Name()) + target, err := os.Readlink(virtfnPath) + if err != nil { + continue + } + + targetPath := filepath.Join(pfDevicePath, target) + targetAbs, err := filepath.EvalSymlinks(targetPath) + if err != nil { + continue + } + + if targetAbs == myDevice { + vfNum, _ := strconv.Atoi(match[1]) + return vfNum, filepath.Base(pfDevicePath) + } + } + + return -1, "" +} + +var ethtoolStatRe = regexp.MustCompile(`^\s+(\w+):\s+(\d+)$`) + +func getEthtoolStats(iface string) 
(map[string]float64, error) { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "ethtool", "-S", iface) + output, err := cmd.Output() + if err != nil { + return nil, err + } + + stats := make(map[string]float64) + scanner := bufio.NewScanner(strings.NewReader(string(output))) + + for scanner.Scan() { + line := scanner.Text() + match := ethtoolStatRe.FindStringSubmatch(line) + if match != nil { + value, err := strconv.ParseFloat(match[2], 64) + if err != nil { + log.Debugf("failed to parse %s=%s: %s", match[1], match[2], err) + continue + } + stats[match[1]] = value + } + } + + return stats, scanner.Err() +} + +func buildLabels(info InterfaceInfo, dataSource string) []string { + vfNum := "" + if info.VFNum >= 0 { + vfNum = strconv.Itoa(info.VFNum) + } + + ifType := "unknown" + if info.IsPF { + ifType = "pf" + } else if info.IsVF { + ifType = "vf" + } + + return []string{ + info.Name, + ifType, + info.ParentPF, + vfNum, + info.Driver, + dataSource, + info.NumaNode, + } +} + +var vfStatPatterns = []*regexp.Regexp{ + regexp.MustCompile(`^vf_(\w+)\[(\d+)\]$`), // ixgbe: vf_rx_packets[0] + regexp.MustCompile(`^vf-(\d+)-(\w+)$`), // i40e: vf-0-rx_packets + regexp.MustCompile(`^vf_(\d+)_(\w+)$`), // ice: vf_0_rx_packets +} + +func parseVFStat(statName string) (vfNum int, statType string, ok bool) { + if match := vfStatPatterns[0].FindStringSubmatch(statName); match != nil { + statType = match[1] + vfNum, _ = strconv.Atoi(match[2]) + return vfNum, statType, true + } + + if match := vfStatPatterns[1].FindStringSubmatch(statName); match != nil { + vfNum, _ = strconv.Atoi(match[1]) + statType = match[2] + return vfNum, statType, true + } + + if match := vfStatPatterns[2].FindStringSubmatch(statName); match != nil { + vfNum, _ = strconv.Atoi(match[1]) + statType = match[2] + return vfNum, statType, true + } + + return 0, "", false +} + +var queueStatRe = 
regexp.MustCompile(`^(tx|rx)_queue_(\d+)_(packets|bytes)$`) + +func (Collector) Collect(ch chan<- prometheus.Metric) { + interfaces, err := discoverSriovInterfaces() + if err != nil { + log.Errf("failed to discover SR-IOV interfaces: %s", err) + return + } + + log.Debugf("discovered %d SR-IOV interfaces", len(interfaces)) + + seenVFStats := make(map[string]bool) + + for _, iface := range interfaces { + stats, err := getEthtoolStats(iface.Name) + if err != nil { + log.Debugf("ethtool -S %s: %s", iface.Name, err) + continue + } + + log.Debugf("collected %d stats for %s (PF=%v, VF=%v, driver=%s)", + len(stats), iface.Name, iface.IsPF, iface.IsVF, iface.Driver) + + if iface.IsVF { + labels := buildLabels(iface, "direct") + collectInterfaceStats(ch, labels, stats) + key := iface.ParentPF + ":" + strconv.Itoa(iface.VFNum) + seenVFStats[key] = true + } + + if iface.IsPF { + labels := buildLabels(iface, "direct") + collectInterfaceStats(ch, labels, stats) + collectPerVFStatsFromPF(ch, iface, stats, seenVFStats) + } + } +} + +func collectInterfaceStats(ch chan<- prometheus.Metric, labels []string, stats map[string]float64) { + for statName, value := range stats { + if _, _, ok := parseVFStat(statName); ok { + continue + } + + if match := queueStatRe.FindStringSubmatch(statName); match != nil { + direction := match[1] + queueNum := match[2] + statType := match[3] + + metricName := "sriov_" + direction + "_queue_" + statType + "_total" + queueLabels := append(append([]string{}, labels...), queueNum) + + desc := prometheus.NewDesc( + metricName, + statType+" "+direction+" on queue", + append(extendedLabels, "queue"), + nil, + ) + + if config.MetricSets().Has(config.METRICS_PERF) { + ch <- prometheus.MustNewConstMetric( + desc, prometheus.CounterValue, value, queueLabels...) + } + continue + } + + if m, ok := metrics[statName]; ok { + if config.MetricSets().Has(m.Set) { + ch <- prometheus.MustNewConstMetric( + m.Desc(), m.ValueType, value, labels...) 
+ } + } + } +} + +func collectPerVFStatsFromPF(ch chan<- prometheus.Metric, pfInfo InterfaceInfo, stats map[string]float64, seenVFStats map[string]bool) { + for statName, value := range stats { + vfNum, statType, ok := parseVFStat(statName) + if !ok { + continue + } + + key := pfInfo.PCIAddr + ":" + strconv.Itoa(vfNum) + if seenVFStats[key] { + continue + } + + metricName := "sriov_vf_" + statType + "_total" + + vfLabels := []string{ + "", + "vf", + pfInfo.PCIAddr, + strconv.Itoa(vfNum), + "vfio-pci", + "pf_aggregate", + } + + desc := prometheus.NewDesc( + metricName, + "VF "+statType+" collected from PF", + extendedLabels, + nil, + ) + + var metricSet config.MetricSet + if strings.Contains(statType, "error") || strings.Contains(statType, "drop") { + metricSet = config.METRICS_ERRORS + } else { + metricSet = config.METRICS_COUNTERS + } + + if config.MetricSets().Has(metricSet) { + ch <- prometheus.MustNewConstMetric( + desc, prometheus.CounterValue, value, vfLabels...) + } + } +} diff --git a/collectors/sriov/metrics.go b/collectors/sriov/metrics.go new file mode 100644 index 0000000..18cd342 --- /dev/null +++ b/collectors/sriov/metrics.go @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 + +package sriov + +import ( + "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/lib" + "github.com/openstack-k8s-operators/openstack-network-exporter/config" + "github.com/prometheus/client_golang/prometheus" +) + +var extendedLabels = []string{ + "interface", + "type", + "parent_pf", + "vf_num", + "driver", + "data_source", + "numa_node", +} + +var metrics = map[string]lib.Metric{ + "rx_bytes": { + Name: "sriov_rx_bytes_total", + Description: "Total number of received bytes on SR-IOV interface", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "tx_bytes": { + Name: "sriov_tx_bytes_total", + Description: "Total number of transmitted bytes on SR-IOV interface", + Labels: extendedLabels, + 
ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_unicast": { + Name: "sriov_rx_unicast_packets_total", + Description: "Total number of received unicast packets", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "tx_unicast": { + Name: "sriov_tx_unicast_packets_total", + Description: "Total number of transmitted unicast packets", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_multicast": { + Name: "sriov_rx_multicast_packets_total", + Description: "Total number of received multicast packets", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "tx_multicast": { + Name: "sriov_tx_multicast_packets_total", + Description: "Total number of transmitted multicast packets", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_broadcast": { + Name: "sriov_rx_broadcast_packets_total", + Description: "Total number of received broadcast packets", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "tx_broadcast": { + Name: "sriov_tx_broadcast_packets_total", + Description: "Total number of transmitted broadcast packets", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_dropped": { + Name: "sriov_rx_dropped_total", + Description: "Total number of received packets dropped", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_ERRORS, + }, + "tx_errors": { + Name: "sriov_tx_errors_total", + Description: "Total number of transmit errors", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_ERRORS, + }, + "rx_alloc_fail": { + Name: "sriov_rx_alloc_fail_total", + Description: "Total number of RX buffer allocation failures", + Labels: extendedLabels, + ValueType: 
prometheus.CounterValue, + Set: config.METRICS_ERRORS, + }, + "rx_pg_alloc_fail": { + Name: "sriov_rx_pg_alloc_fail_total", + Description: "Total number of RX page allocation failures", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_ERRORS, + }, + "tx_linearize": { + Name: "sriov_tx_linearize_total", + Description: "Number of times TX linearization was needed", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_PERF, + }, + "tx_busy": { + Name: "sriov_tx_busy_total", + Description: "Number of times TX queue was busy", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_PERF, + }, + "tx_restart": { + Name: "sriov_tx_restart_total", + Description: "Number of TX queue restarts", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_PERF, + }, +} From e351489f2b968a436ad7ef88f7b60c08f0c780ae Mon Sep 17 00:00:00 2001 From: Anthony Harivel Date: Thu, 15 Jan 2026 14:22:13 +0100 Subject: [PATCH 2/3] collectors: sriov: use safchain/ethtool library Replace exec.Command("ethtool", "-S", ...) with the safchain/ethtool Go library which uses ioctl directly. This removes the overhead of spawning an external ethtool process and parsing its text output on every scrape, and provides cleaner error handling while maintaining the same functionality.
Signed-off-by: Anthony Harivel --- collectors/sriov/collector.go | 46 +++++++++-------------------------- go.mod | 1 + 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/collectors/sriov/collector.go b/collectors/sriov/collector.go index 4fe6a3d..db15769 100644 --- a/collectors/sriov/collector.go +++ b/collectors/sriov/collector.go @@ -3,20 +3,17 @@ package sriov import ( - "bufio" - "context" "os" - "os/exec" "path/filepath" "regexp" "strconv" "strings" - "time" "github.com/openstack-k8s-operators/openstack-network-exporter/collectors/lib" "github.com/openstack-k8s-operators/openstack-network-exporter/config" "github.com/openstack-k8s-operators/openstack-network-exporter/log" "github.com/prometheus/client_golang/prometheus" + "github.com/safchain/ethtool" ) type Collector struct{} @@ -179,35 +176,14 @@ func getVFNumber(devicePath string) (int, string) { return -1, "" } -var ethtoolStatRe = regexp.MustCompile(`^\s+(\w+):\s+(\d+)$`) - -func getEthtoolStats(iface string) (map[string]float64, error) { - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - cmd := exec.CommandContext(ctx, "ethtool", "-S", iface) - output, err := cmd.Output() +func getEthtoolStats(iface string) (map[string]uint64, error) { + eth, err := ethtool.NewEthtool() if err != nil { return nil, err } + defer eth.Close() - stats := make(map[string]float64) - scanner := bufio.NewScanner(strings.NewReader(string(output))) - - for scanner.Scan() { - line := scanner.Text() - match := ethtoolStatRe.FindStringSubmatch(line) - if match != nil { - value, err := strconv.ParseFloat(match[2], 64) - if err != nil { - log.Debugf("failed to parse %s=%s: %s", match[1], match[2], err) - continue - } - stats[match[1]] = value - } - } - - return stats, scanner.Err() + return eth.Stats(iface) } func buildLabels(info InterfaceInfo, dataSource string) []string { @@ -278,7 +254,7 @@ func (Collector) Collect(ch chan<- prometheus.Metric) { for _, iface := range 
interfaces { stats, err := getEthtoolStats(iface.Name) if err != nil { - log.Debugf("ethtool -S %s: %s", iface.Name, err) + log.Debugf("ethtool stats %s: %s", iface.Name, err) continue } @@ -300,7 +276,7 @@ func (Collector) Collect(ch chan<- prometheus.Metric) { } } -func collectInterfaceStats(ch chan<- prometheus.Metric, labels []string, stats map[string]float64) { +func collectInterfaceStats(ch chan<- prometheus.Metric, labels []string, stats map[string]uint64) { for statName, value := range stats { if _, _, ok := parseVFStat(statName); ok { continue @@ -323,7 +299,7 @@ func collectInterfaceStats(ch chan<- prometheus.Metric, labels []string, stats m if config.MetricSets().Has(config.METRICS_PERF) { ch <- prometheus.MustNewConstMetric( - desc, prometheus.CounterValue, value, queueLabels...) + desc, prometheus.CounterValue, float64(value), queueLabels...) } continue } @@ -331,13 +307,13 @@ func collectInterfaceStats(ch chan<- prometheus.Metric, labels []string, stats m if m, ok := metrics[statName]; ok { if config.MetricSets().Has(m.Set) { ch <- prometheus.MustNewConstMetric( - m.Desc(), m.ValueType, value, labels...) + m.Desc(), m.ValueType, float64(value), labels...) } } } } -func collectPerVFStatsFromPF(ch chan<- prometheus.Metric, pfInfo InterfaceInfo, stats map[string]float64, seenVFStats map[string]bool) { +func collectPerVFStatsFromPF(ch chan<- prometheus.Metric, pfInfo InterfaceInfo, stats map[string]uint64, seenVFStats map[string]bool) { for statName, value := range stats { vfNum, statType, ok := parseVFStat(statName) if !ok { @@ -376,7 +352,7 @@ func collectPerVFStatsFromPF(ch chan<- prometheus.Metric, pfInfo InterfaceInfo, if config.MetricSets().Has(metricSet) { ch <- prometheus.MustNewConstMetric( - desc, prometheus.CounterValue, value, vfLabels...) + desc, prometheus.CounterValue, float64(value), vfLabels...) 
 } } } diff --git a/go.mod b/go.mod index 2f233d1..02b8753 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/go-logr/logr v1.4.1 github.com/ovn-org/libovsdb v0.7.0 github.com/prometheus/client_golang v1.20.5 + github.com/safchain/ethtool v0.4.1 gopkg.in/yaml.v3 v3.0.1 ) From 368e9ee9668ff4f1bebfa33a1e623f8b61094bc9 Mon Sep 17 00:00:00 2001 From: Anthony Harivel Date: Wed, 21 Jan 2026 20:41:50 +0100 Subject: [PATCH 3/3] collectors: sriov: test per-VF stats via netlink (ip link) Replace the ethtool stat-name parsing of per-VF counters with the per-VF statistics that netlink reports for the PF link (the data behind "ip -s link show"). Signed-off-by: Anthony Harivel --- collectors/sriov/collector.go | 139 +++++++++++++++++++++------------- collectors/sriov/metrics.go | 61 +++++++++++++++ 2 files changed, 148 insertions(+), 52 deletions(-) diff --git a/collectors/sriov/collector.go b/collectors/sriov/collector.go index db15769..58f4454 100644 --- a/collectors/sriov/collector.go +++ b/collectors/sriov/collector.go @@ -3,6 +3,7 @@ package sriov import ( + "net" "os" "path/filepath" "regexp" @@ -14,6 +15,7 @@ import ( "github.com/openstack-k8s-operators/openstack-network-exporter/log" "github.com/prometheus/client_golang/prometheus" "github.com/safchain/ethtool" + "github.com/vishvananda/netlink" ) type Collector struct{} @@ -27,6 +29,9 @@ func (Collector) Metrics() []lib.Metric { for _, m := range metrics { res = append(res, m) } + for _, m := range vfNetlinkMetrics { + res = append(res, m) + } return res } @@ -210,32 +215,50 @@ func buildLabels(info InterfaceInfo, dataSource string) []string { } } -var vfStatPatterns = []*regexp.Regexp{ - regexp.MustCompile(`^vf_(\w+)\[(\d+)\]$`), // ixgbe: vf_rx_packets[0] - regexp.MustCompile(`^vf-(\d+)-(\w+)$`), // i40e: vf-0-rx_packets - regexp.MustCompile(`^vf_(\d+)_(\w+)$`), // ice: vf_0_rx_packets +// VFStats holds statistics for a VF obtained via netlink +type VFStats struct { + VFNum int + MAC net.HardwareAddr + RxBytes uint64 + TxBytes uint64 + RxPackets uint64 + TxPackets uint64 + Multicast uint64 + Broadcast uint64 + RxDropped uint64 + TxDropped uint64 } -func parseVFStat(statName string) (vfNum int,
statType string, ok bool) { - if match := vfStatPatterns[0].FindStringSubmatch(statName); match != nil { - statType = match[1] - vfNum, _ = strconv.Atoi(match[2]) - return vfNum, statType, true +// getVFStatsFromNetlink retrieves per-VF statistics using netlink (ip -s link show) +func getVFStatsFromNetlink(ifaceName string) ([]VFStats, error) { + link, err := netlink.LinkByName(ifaceName) + if err != nil { + return nil, err } - if match := vfStatPatterns[1].FindStringSubmatch(statName); match != nil { - vfNum, _ = strconv.Atoi(match[1]) - statType = match[2] - return vfNum, statType, true + vfInfos := link.Attrs().Vfs + if len(vfInfos) == 0 { + return nil, nil } - if match := vfStatPatterns[2].FindStringSubmatch(statName); match != nil { - vfNum, _ = strconv.Atoi(match[1]) - statType = match[2] - return vfNum, statType, true + var stats []VFStats + for _, vf := range vfInfos { + s := VFStats{ + VFNum: vf.ID, + MAC: vf.Mac, + RxBytes: vf.RxBytes, + TxBytes: vf.TxBytes, + RxPackets: vf.RxPackets, + TxPackets: vf.TxPackets, + Multicast: vf.Multicast, + Broadcast: vf.Broadcast, + RxDropped: vf.RxDropped, + TxDropped: vf.TxDropped, + } + stats = append(stats, s) } - return 0, "", false + return stats, nil } var queueStatRe = regexp.MustCompile(`^(tx|rx)_queue_(\d+)_(packets|bytes)$`) @@ -271,17 +294,14 @@ func (Collector) Collect(ch chan<- prometheus.Metric) { if iface.IsPF { labels := buildLabels(iface, "direct") collectInterfaceStats(ch, labels, stats) - collectPerVFStatsFromPF(ch, iface, stats, seenVFStats) + // Use netlink to get per-VF stats (works on ice, mlx5_core, etc.) 
+ collectVFStatsFromNetlink(ch, iface, seenVFStats) } } } func collectInterfaceStats(ch chan<- prometheus.Metric, labels []string, stats map[string]uint64) { for statName, value := range stats { - if _, _, ok := parseVFStat(statName); ok { - continue - } - if match := queueStatRe.FindStringSubmatch(statName); match != nil { direction := match[1] queueNum := match[2] @@ -313,46 +333,61 @@ func collectInterfaceStats(ch chan<- prometheus.Metric, labels []string, stats m } } -func collectPerVFStatsFromPF(ch chan<- prometheus.Metric, pfInfo InterfaceInfo, stats map[string]uint64, seenVFStats map[string]bool) { - for statName, value := range stats { - vfNum, statType, ok := parseVFStat(statName) - if !ok { - continue - } +func emitVFMetric(ch chan<- prometheus.Metric, metricKey string, value uint64, labels []string) { + m := vfNetlinkMetrics[metricKey] + ch <- prometheus.MustNewConstMetric(m.Desc(), m.ValueType, float64(value), labels...) +} + +func collectVFStatsFromNetlink(ch chan<- prometheus.Metric, pfInfo InterfaceInfo, seenVFStats map[string]bool) { + vfStats, err := getVFStatsFromNetlink(pfInfo.Name) + if err != nil { + log.Debugf("netlink VF stats for %s: %s", pfInfo.Name, err) + return + } + + if len(vfStats) == 0 { + log.Debugf("no VF stats from netlink for %s", pfInfo.Name) + return + } + + log.Debugf("collected %d VF stats via netlink for PF %s", len(vfStats), pfInfo.Name) + + for _, vf := range vfStats { + key := pfInfo.PCIAddr + ":" + strconv.Itoa(vf.VFNum) - key := pfInfo.PCIAddr + ":" + strconv.Itoa(vfNum) + // Skip if we already collected direct stats for this VF if seenVFStats[key] { + log.Debugf("skipping VF %d on %s - already have direct stats", vf.VFNum, pfInfo.Name) continue } - metricName := "sriov_vf_" + statType + "_total" + // Determine driver - if not in seenVFStats, it's likely vfio-pci + driver := "vfio-pci" vfLabels := []string{ - "", - "vf", - pfInfo.PCIAddr, - strconv.Itoa(vfNum), - "vfio-pci", - "pf_aggregate", + "", // interface (empty 
for vfio-pci) + "vf", // type + pfInfo.PCIAddr, // parent_pf + strconv.Itoa(vf.VFNum), // vf_num + driver, // driver + "netlink", // data_source + pfInfo.NumaNode, // numa_node } - desc := prometheus.NewDesc( - metricName, - "VF "+statType+" collected from PF", - extendedLabels, - nil, - ) - - var metricSet config.MetricSet - if strings.Contains(statType, "error") || strings.Contains(statType, "drop") { - metricSet = config.METRICS_ERRORS - } else { - metricSet = config.METRICS_COUNTERS + // Emit counter metrics + if config.MetricSets().Has(config.METRICS_COUNTERS) { + emitVFMetric(ch, "rx_bytes", vf.RxBytes, vfLabels) + emitVFMetric(ch, "tx_bytes", vf.TxBytes, vfLabels) + emitVFMetric(ch, "rx_packets", vf.RxPackets, vfLabels) + emitVFMetric(ch, "tx_packets", vf.TxPackets, vfLabels) + emitVFMetric(ch, "rx_multicast", vf.Multicast, vfLabels) + emitVFMetric(ch, "rx_broadcast", vf.Broadcast, vfLabels) } - if config.MetricSets().Has(metricSet) { - ch <- prometheus.MustNewConstMetric( - desc, prometheus.CounterValue, float64(value), vfLabels...) 
+ // Emit error metrics + if config.MetricSets().Has(config.METRICS_ERRORS) { + emitVFMetric(ch, "rx_dropped", vf.RxDropped, vfLabels) + emitVFMetric(ch, "tx_dropped", vf.TxDropped, vfLabels) } } } diff --git a/collectors/sriov/metrics.go b/collectors/sriov/metrics.go index 18cd342..51390b3 100644 --- a/collectors/sriov/metrics.go +++ b/collectors/sriov/metrics.go @@ -18,6 +18,67 @@ var extendedLabels = []string{ "numa_node", } +// vfNetlinkMetrics are metrics collected via netlink for VFs (especially vfio-pci bound) +var vfNetlinkMetrics = map[string]lib.Metric{ + "rx_bytes": { + Name: "sriov_vf_rx_bytes_total", + Description: "Total bytes received by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "tx_bytes": { + Name: "sriov_vf_tx_bytes_total", + Description: "Total bytes transmitted by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_packets": { + Name: "sriov_vf_rx_packets_total", + Description: "Total packets received by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "tx_packets": { + Name: "sriov_vf_tx_packets_total", + Description: "Total packets transmitted by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_multicast": { + Name: "sriov_vf_rx_multicast_total", + Description: "Total multicast packets received by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_broadcast": { + Name: "sriov_vf_rx_broadcast_total", + Description: "Total broadcast packets received by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_COUNTERS, + }, + "rx_dropped": { + Name: "sriov_vf_rx_dropped_total", + Description: "Total packets dropped on 
receive by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_ERRORS, + }, + "tx_dropped": { + Name: "sriov_vf_tx_dropped_total", + Description: "Total packets dropped on transmit by VF (from netlink)", + Labels: extendedLabels, + ValueType: prometheus.CounterValue, + Set: config.METRICS_ERRORS, + }, +} + +// metrics are metrics collected via ethtool for PFs and VFs with network drivers var metrics = map[string]lib.Metric{ "rx_bytes": { Name: "sriov_rx_bytes_total",