From a51296073319fc7bdad2656d1b005c787a01b0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Fri, 28 Nov 2025 15:49:24 +0200 Subject: [PATCH 1/5] Add local monitoring using Telegraf and InfluxDB --- application.nix | 38 +++ base/default.nix | 1 + base/monitoring.nix | 390 ++++++++++++++++++++++ testing/integration/monitoring-basic.nix | 126 +++++++ testing/integration/monitoring-stress.nix | 321 ++++++++++++++++++ 5 files changed, 876 insertions(+) create mode 100644 base/monitoring.nix create mode 100644 testing/integration/monitoring-basic.nix create mode 100644 testing/integration/monitoring-stress.nix diff --git a/application.nix b/application.nix index 9511b0d6..0b0d0ccf 100644 --- a/application.nix +++ b/application.nix @@ -84,6 +84,44 @@ rec { group = "users"; }; + playos.monitoring.enable = true; + playos.monitoring.extraServices = [ "dividat-driver.service" ]; + + systemd.services.telegraf.path = with pkgs; [ procps ]; # pgrep for procstat + + # track the memory and cpu usage of processes started in the X11 session + # (kiosk, qtwebengine and anything else) + services.telegraf.extraConfig = { + inputs.procstat = [{ + properties = [ "cpu" "memory" ]; + + taginclude = [ "process_name" ]; # not unique! + fieldinclude = [ + "pid" # Note: PID is a field, not a tag, to avoid tag cardinality + # growth due to restarts. + "cpu_time_iowait" + "cpu_usage" + "memory_rss" + "memory_shared" + ]; + + filter = [{ + name = "session-procs"; + cgroups = [ "/sys/fs/cgroup/user.slice/user-*.slice/session-*.scope" ]; + users = [ "play" ]; + }]; + + }]; + + processors.strings = [{ + left = [{ + tag = "process_name"; + width = 64; # trim process_names to at most 64 chars to avoid very long tag names + }]; + }]; + + }; + # Limit virtual terminals that can be switched to # Virtual terminal 7 is the kiosk, 8 is the status screen playos.xserver.activeVirtualTerminals = [ 7 8 ]; diff --git a/base/default.nix b/base/default.nix index 00297378..769292bf 100644 --- a/base/default.nix +++ b/base/default.nix @@ -19,6 +19,7 @@ with lib; ./system-partition.nix ./volatile-root.nix ./compatibility + ./monitoring.nix ]; options = { diff --git a/base/monitoring.nix b/base/monitoring.nix new file mode 100644 index 00000000..6790490c --- /dev/null +++ b/base/monitoring.nix @@ -0,0 +1,390 @@ +{ + config, + pkgs, + lib, + ... +}: +let + cfg = config.playos.monitoring; + localDatabase = cfg.localDbName; + dbRetention = cfg.localRetention; +in +{ + imports = [ + ./volatile-root.nix + ]; + + options = { + playos.monitoring = with lib; { + enable = mkEnableOption "Enable system monitoring tools"; + + localDbName = mkOption { + default = "playos"; + type = types.str; + }; + + localRetention = mkOption { + default = "12w"; + example = "30d"; + description = '' + How long to keep data in the local DB. Specified as duration unit string. + See: + https://docs.influxdata.com/influxdb/v1/query_language/manage-database/#retention-policy-management + https://docs.influxdata.com/influxdb/v1/query_language/spec/#durations + ''; + type = types.str; + }; + + localDbShard = mkOption { + default = "1w"; + example = "3d;"; + description = "InfluxDB shard duration (size). Must be smaller than localRetention duration. 
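+          Retention is enforced by dropping whole shard groups, so with the
+          default 12w retention and 1w shards expired data is removed roughly
+          one week at a time.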
See localRetention for references"; + type = types.str; + }; + + collectionIntervalSeconds = mkOption { + default = 60; + description = "Interval at which to collect metrics (in seconds)"; + type = types.ints.between 1 (60 * 60); + }; + + extraServices = mkOption { + default = [ ]; + description = "List of extra systemd service names (globs) to monitor"; + type = types.listOf types.str; + }; + }; + }; + + config = + let + commonServiceConfig = { + # restart with delay and backoff + Restart = lib.mkForce "always"; + RestartMaxDelaySec = "10min"; + RestartSteps = 10; + + # stop restarting after 20 attemps + StartLimitIntervalSec = "infinity"; + StartLimitBurst = 20; + + # limit resource usage + CPUWeight = 100 / 10; # 10 times smaller than the default + IOWeight = 100 / 10; + }; + + # A slightly silly, but helpful way to validate Telegraf's config. + # Due to the nix->TOML transformation and Telegraf's weird spec + # it is very easy to accidentally produce a broken config. + telegrafConfigIsValid = + let + telegrafCfg = config.services.telegraf; + settingsFormat = pkgs.formats.toml { }; + configFile = settingsFormat.generate "config.toml" + (telegrafCfg.extraConfig // {agent.debug = true; }); + in + pkgs.runCommand + "validate-config" + { buildInputs = with pkgs; [ telegraf ]; } + '' + set -euo pipefail + + echo "=== Validating telegraf's config..." + + if telegraf --config ${configFile} --test &> output.txt; then + echo "=== Config seems good!" + touch $out + else + echo "=== Config validation FAILED, config was:" + cat ${configFile} + + echo "=== Telegraf ouput:" + cat output.txt + + exit 1 + fi + ''; + + in + lib.mkIf cfg.enable { + + ### InfluxDB --- local metric storage + + services.influxdb.enable = true; + + playos.storage.persistentFolders."${config.services.influxdb.dataDir}" = { + mode = "0700"; + user = config.services.influxdb.user; + group = config.users.users."${config.services.influxdb.user}".group; + }; + + # for maintenance ops + environment.systemPackages = [ pkgs.influxdb ]; + + systemd.services.influxdb.serviceConfig = commonServiceConfig // { + # for the socket file + RuntimeDirectory = "influxdb"; + # for db / data + StateDirectory = "influxdb"; + + MemoryMax = "500M"; + + # limit to two cores + Environment = "GOMAXPROCS=2"; + }; + + services.influxdb.dataDir = "/var/lib/influxdb"; # use the standard dir + + services.influxdb.extraConfig = { + reporting-disabled = true; + + http = { + enabled = true; + + bind-address = "localhost:8086"; + unix-socket-enabled = true; + bind-socket = "/var/run/influxdb/influxdb.sock"; + + auth-enabled = false; + log-enabled = false; + write-tracing = false; + pprof-enabled = false; + }; + + meta = { + retention-autocreate = false; + }; + + data = { + query-log-enabled = false; + + # avoid accidental cardinality explosions + max-series-per-database = 4000; + max-values-per-tag = 100; + + # reject writes if cache grows big + cache-max-memory-size = "200m"; + + # do one thing at a time + max-concurrent-compactions = 1; + }; + + logging.level = "warn"; + logging.suppress-logo = true; + + monitor.store-enabled = false; + subscriber.enabled = false; + continuous_queries.enabled = false; + admin.enabled = false; + hinted-handoff.enabled = false; + }; + + ### Telegraf --- metric collection + + services.telegraf.enable = true; + + systemd.tmpfiles.rules = [ + "f '/var/cache/telegraf/env-file' 0755 telegraf telegraf - -" + ]; + + # expose machine-id via an env file and setup the DB + systemd.services.telegraf-setup = { + + serviceConfig.ExecStart = 
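+        # One-shot setup script: exposes MACHINE_ID to Telegraf via an env file
+        # and creates the local database with its retention policy. With the
+        # default options (localDbName = "playos", localRetention = "12w",
+        # localDbShard = "1w") the influx call below expands to roughly:
+        #   CREATE DATABASE playos;
+        #   CREATE RETENTION POLICY "12w" ON playos
+        #     DURATION 12w REPLICATION 1 SHARD DURATION 1w DEFAULT;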
lib.getExe ( + pkgs.writeShellApplication { + name = "telegraf-setup"; + runtimeInputs = with pkgs; [ + influxdb + gnugrep + ]; + text = '' + echo "MACHINE_ID=$(cat /etc/machine-id)" > /var/cache/telegraf/env-file + + result_file=$(mktemp) + trap 'rm -f $result_file' EXIT + + influx -format csv -execute "SHOW DATABASES" > "$result_file" + + if grep -q ${localDatabase} "$result_file"; then + echo "Database '${localDatabase}' exists, nothing to do" + else + echo "Creating ${localDatabase}" + influx -execute 'CREATE DATABASE ${localDatabase}; CREATE RETENTION POLICY "${dbRetention}" ON ${localDatabase} DURATION ${dbRetention} REPLICATION 1 SHARD DURATION ${cfg.localDbShard} DEFAULT; ' + fi + ''; + } + ); + + serviceConfig.Type = "oneshot"; + serviceConfig.User = "telegraf"; + + requires = [ "influxdb.service" ]; + after = [ "influxdb.service" ]; + + before = [ "telegraf.service" ]; + requiredBy = [ "telegraf.service" ]; + }; + + systemd.services.telegraf.serviceConfig = commonServiceConfig // { + EnvironmentFile = "/var/cache/telegraf/env-file"; + + MemoryMax = "200M"; + }; + + systemd.services.telegraf.path = [ + pkgs.lm_sensors # for inputs.sensors + pkgs.dbus # for inputs.systemd_units + telegrafConfigIsValid + ]; + + services.telegraf.extraConfig = with builtins; rec { + global_tags.playos_version = lib.mkIf (config.playos ? "version") config.playos.version; + + agent = { + quiet = true; + hostname = "playos-\${MACHINE_ID}"; + + always_include_global_tags = true; + + interval = "${toString cfg.collectionIntervalSeconds}s"; + precision = "${toString (ceil (cfg.collectionIntervalSeconds / 2.0))}s"; + + # don't launch all collectors at once + collection_jitter = "${toString (ceil (cfg.collectionIntervalSeconds / 5.0))}s"; + + # avoid buffering many things to reduce mem usage + metric_batch_size = 50; + metric_buffer_limit = 100; + }; + + outputs.influxdb = { + urls = [ "unix:///var/run/influxdb/influxdb.sock" ]; + database = "${localDatabase}"; + content_encoding = "identity"; # don't compress + skip_database_creation = true; # we set up the DB manually + }; + + ## INPUTS: collected metrics + + inputs.mem = { + fieldinclude = [ + "cached" + "free" + "mapped" + "used" + "slab" + "shared" + "available" + ]; + }; + + inputs.cpu = { + fieldinclude = [ + "usage_user" + "usage_system" + "usage_active" # is this sum of above? 
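+          # Per the Telegraf inputs.cpu docs, usage_active = 100 - usage_idle,
+          # i.e. all non-idle time (iowait, irq, steal, nice included), so it
+          # is a superset of usage_user + usage_system, not their exact sum.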
+ ]; + }; + + inputs.system = { + fieldinclude = [ + "load*" + ]; + }; + + inputs.disk = { + # drop all the metadata tags except path + taginclude = [ "path" ]; + interval = "5m"; # collect every 5 minutes, we don't expect big fluctuations here + mount_points = [ + "/" # tmpfs overlay + config.playos.storage.persistentDataPartition.mountPath # /mnt/data + ] ++ (builtins.attrNames config.playos.storage.persistentFolders); # individual persistent folders + + fieldinclude = [ + "free" + "used" + "inodes_used" + ]; + }; + + # TODO: check if it works on a PlayOS PC + #inputs.sensors = { }; + + inputs.wireless = { + # keeping many fields for now, to help debug wireless issues + fieldinclude = [ + "status" + "level" + "noise" + "retry" # cumulative retry counts + "misc" # packets dropped for un-specified reason + "missed_beacon" # missed beacon packets + ]; + }; + + inputs.net = { + interfaces = [ + "wl*" + "eth*" + "enp*" + ]; + fieldinclude = [ + "bytes_sent" + "bytes_recv" + "err_in" + "err_out" + "drop_in" + "drop_out" + ]; + ignore_protocol_stats = true; + }; + + # memory usage by dfferent systemd units + # (the plugin, as of v1.36.4, does not return IO or CPU stats) + inputs.systemd_units = + let + generalStuff = { + # drop all the metadata tags except name + taginclude = [ "name" ]; + fieldinclude = [ "mem_current" ]; + + scope = "system"; + details = true; + }; + in + lib.lists.map (x: generalStuff // x) [ + # memory usage by system processes and per user + { + unittype = "slice"; + pattern = lib.strings.concatStringsSep " " [ + "system.slice" + "user-*.slice" + ]; + } + # memory usage by core services + { + unittype = "service"; + pattern = lib.strings.concatStringsSep " " ( + [ + "telegraf.service" + "influxdb.service" + "connman.service" + "playos-*" + ] + ++ cfg.extraServices + ); + } + ]; + + # Additional input plugins of possible interest: + # + # - inputs.procstat + processors.topk for TOP 5 processes by mem/cpu? + # computationally heavy, not sure how useful. + # - inputs.diskio - disk perf, at least read_time, write_time, io_time, io_await? + # - inputs.kernel - various core kernel stats, including mem pressure + + }; + + }; +} diff --git a/testing/integration/monitoring-basic.nix b/testing/integration/monitoring-basic.nix new file mode 100644 index 00000000..e1bbef1a --- /dev/null +++ b/testing/integration/monitoring-basic.nix @@ -0,0 +1,126 @@ +{ + # Enable remote access to InfluxDB (port 18086). Build using + # `nix-build --arg debug true -A driverInteractive` and then use + # influx/Chronograf/Grafana to connect to `http://localhost:18086/` + debug ? false, +}: +let + pkgs = import ../../pkgs { }; + + inherit (pkgs) lib; +in +pkgs.testers.runNixOSTest { + name = "monitoring"; + + nodes = { + machine = + { + config, + lib, + pkgs, + ... 
+ }: + { + imports = [ + ../../base/monitoring.nix + ]; + + config = { + virtualisation.forwardPorts = lib.optional debug { + from = "host"; + host.port = 18086; + guest.port = 8086; + }; + + networking.firewall.enable = lib.mkIf debug (lib.mkForce false); + + playos.monitoring.enable = true; + + # collect faster in here + playos.monitoring.collectionIntervalSeconds = 2; + + # modify retetion policy to check configuration works + playos.monitoring.localRetention = "6h"; # 1h is smallest possible + playos.monitoring.localDbShard = "2h"; # 1h is smallest possible + + environment.systemPackages = [ + pkgs.influxdb + ]; + }; + }; + }; + + extraPythonPackages = ps: [ + ps.colorama + ps.types-colorama + ]; + + testScript = + { nodes }: + let + monCfg = nodes.machine.playos.monitoring; + dbName = monCfg.localDbName; + in + '' + ${builtins.readFile ../helpers/nixos-test-script-helpers.py} + import csv + + ## CONSTANTS + + ## HELPERS + + def run_query(query, as_dict_reader=True): + res = machine.succeed( + f"influx -database ${dbName} -format csv -execute '{query}'" + ).strip().split('\n') + # there should be at least a header + assert len(res) > 1, f"Query '{query}' returned no data?" + if as_dict_reader: + return csv.DictReader(res) + else: + return res + + + ## TESTS + + machine.start() + + with TestCase("influxdb and telegraf are running"): + machine.wait_for_unit("influxdb.service", timeout=10) + machine.wait_for_unit("telegraf.service", timeout=10) + + with TestCase("Retention policy is setup") as t: + results = list(run_query("SHOW RETENTION POLICIES")) + t.assertEqual(len(results), 1, + f"More than one retention policy found: {results}") + + policy = results[0] + t.assertEqual(policy['name'], "${monCfg.localRetention}") + t.assertTrue(policy['duration'].startswith("${monCfg.localRetention}")) + t.assertTrue(policy['shardGroupDuration'].startswith("${monCfg.localDbShard}")) + t.assertEqual(policy['default'], "true") + + sleep_duration = ${toString monCfg.collectionIntervalSeconds} * 3 + print(f"Sleeping for {sleep_duration} seconds to collect some metrics") + time.sleep(sleep_duration) + print("Restarting telegraf to force flush") + machine.systemctl("restart telegraf.service") + + with TestCase("Metrics are received") as t: + results = list(run_query("SELECT * FROM mem LIMIT 2")) + t.assertGreater(len(results), 1, "Expected at least 2 rows") + first_result = results[0] + + t.assertIn("free", first_result) + t.assertIn("used", first_result) + + with TestCase("Metrics are tagged with machine-id") as t: + machineId = machine.succeed("cat /etc/machine-id").strip() + t.assertEqual(first_result['host'], f"playos-{machineId}") + + with TestCase("Unnecessary tags are dropped") as t: + cpu_row = list(run_query("SELECT * FROM cpu LIMIT 1"))[0] + t.assertNotIn("time_guest_nice", cpu_row) + t.assertNotIn("usage_guest_nice", cpu_row) + ''; +} diff --git a/testing/integration/monitoring-stress.nix b/testing/integration/monitoring-stress.nix new file mode 100644 index 00000000..7a970b96 --- /dev/null +++ b/testing/integration/monitoring-stress.nix @@ -0,0 +1,321 @@ +# Stress and "volume" testing of the monitoring setup. 
+# +# The test generates simulated data to backfill InfluxDB for the configured +# retention period and checks several invariants: + +# - No cardinality explosion: the default metric collection (via Telegraf) is +# ran to check that it produces series with cardinalities within expected +# limits (for this stress test and in general) +# - Telegraf memory usage is within limits +# - Disk usage: after backfilling, stored InfluxDB data is within expeted limits +# - Memory usage: after backfilling and after InfluxDB completes compaction, +# we check that it can reach a resting state with low memory usage. +# +# Note that the observed memory usage here is higher than what we expect in a +# production system, since: +# - Telegraf is collecting data at 1s intervals, 60x faster +# - Months of data for the whole retention period is generated in *minutes* + +{ + # Enabling slowMode runs this test in a less stressful way by setting a + # delay between batched writes. This makes the data generation last ~15 + # minutes (instead of ~1-2m). This is still 8000x faster than we will be + # producing data, but gives enough time for InfluxDB to do house-cleaning + # and shows a more realistic memory profile. + slowMode ? false, + + # for (test) development - run through with tiny data and sleeps to check if + # the test setup works + speedrun ? false, + + # Enable remote access to InfluxDB (port 18086). Build using + # `nix-build --arg debug true -A driverInteractive` and then use + # influx/Chronograf/Grafana to connect to `http://localhost:18086/` + debug ? false, +}: +let + pkgs = import ../../pkgs { }; + + inherit (pkgs) lib; + + # influxdb stress testing tool + inch_tool = pkgs.buildGoModule rec { + pname = "inch"; + name = "inch"; + + vendorHash = "sha256-upbcZCZEqgp8QlbA1qihLBmyHA0oA5PatN/ur6MkzqU="; + + src = ( + pkgs.fetchFromGitHub { + owner = "influxdata"; + repo = "inch"; + rev = "56a9750e91941d59a17ef2463d351513f378d9f4"; + sha256 = "sha256-UXg3+L4PMW8u5RLeDja0kYzxUnljhxVYe+p29XW4xoM="; + } + ); + }; + + # how much to pause between batches when generating simulated metric data, + # controls generation speed and load on InfluxDB + writeDelay = if slowMode then "300ms" else "0"; + + # cgroup/OOM limits for InfluxDB. Actual expected usage is smaller, this is + # just to avoid OOM due to stress, see assertions. + memoryMax = if slowMode then "500M" else "2G"; + + # How much data to generate (and how much is stored in InfluxDB) + localRetentionWeeks = + if speedrun then + 1 + else + # matches the default configuration + 12; +in +pkgs.testers.runNixOSTest { + name = "monitoring"; + + nodes = { + machine = + { + config, + lib, + pkgs, + ... 
+ }: + { + imports = [ + ../../base/monitoring.nix + ]; + + config = { + virtualisation.forwardPorts = lib.optional debug { + from = "host"; + host.port = 18086; + guest.port = 8086; + }; + + networking.firewall.enable = lib.mkIf debug (lib.mkForce false); + + virtualisation.memorySize = lib.mkForce 3000; + + playos.monitoring.enable = true; + + # collect faster in here + playos.monitoring.collectionIntervalSeconds = + if slowMode then + 10 # still 6x more frequent + else + 1; + playos.monitoring.localRetention = "${toString localRetentionWeeks}w"; + + # enable frequent compaction to observe results quickly + services.influxdb.extraConfig.data = { + cache-snapshot-write-cold-duration = "1m"; + compact-full-write-cold-duration = "1m"; + }; + + systemd.services.influxdb.serviceConfig = { + MemoryMax = lib.mkForce memoryMax; + }; + + environment.systemPackages = [ + pkgs.influxdb + inch_tool + ]; + }; + }; + }; + + extraPythonPackages = ps: [ + ps.colorama + ps.types-colorama + ]; + + testScript = + { nodes }: + '' + ${builtins.readFile ../helpers/nixos-test-script-helpers.py} + import math + import json + + ## CONSTANTS + + # How much space can the InfluxDB take on disk + MAX_INFLUXDB_STORED_SIZE_MB = 300 + + # How much memory we expect InfluxDB to use in a "steady" state + # (compaction done, regular telegraf collection) + MAX_INFLUXDB_PEAK_MEMORY_MB = 150 + + MAX_TELEGRAF_PEAK_MEMORY_MB = 150 + + SPEEDRUN = json.loads("${lib.boolToString speedrun}") + if SPEEDRUN: + print("===== RUNNING IN SPEEDRUN MODE =====") + + weeks = ${toString localRetentionWeeks} + measurements = 10 # 1 measurement = 1 configured input plugin + + # Empirically we expect to have <100 unique tag values over ALL of the + # measurements, so product of tag_cardinalities should be less than + # 100/measurements + tag_cardinalities = [2, 5] + + # 1 series = 1 unique tag combination within a measurement + num_series = measurements * math.prod(tag_cardinalities) + + # each series will have this number of data fields + fields = 5 + + points_per_minute = 1 + points = weeks*7*24*60*points_per_minute + + ## HELPERS + + def run_query(query): + res = machine.succeed( + f"influx -database playos -format csv -execute '{query}'" + ).strip().split('\n') + # there should be at least a header + assert len(res) > 1, f"Query '{query}' returned no data?" + return res + + + def check_cardinalities(t): + measurements_list = run_query("SHOW MEASUREMENTS") + total_measurements = len(measurements_list) - 1 # minus header + + series_list = run_query("SHOW SERIES") + total_series = len(series_list) - 1 + + fields_list = run_query("SHOW FIELD KEYS") + total_fields = len(fields_list) - 1 + + print(f""" + Telegraf data collection produced: + - {total_measurements} measurements + - {total_series} unique series (measurements x tag_combos) + - {total_fields} unique fields, for a... + - {round(total_fields/total_measurements, 1)} avg. 
fields per measurement + """) + + t.assertLess(total_series, num_series, + "Telegraf collected metrics exceed assumed max series count") + t.assertLess(total_fields, measurements * fields, + "Telegraf collected metrics produced more fields than assumed") + + + def get_memory_stats_mb(service): + memory_current = machine.succeed(f"systemctl show {service} -p MemoryCurrent --value") + memory_peak = machine.succeed(f"systemctl show {service} -p MemoryPeak --value") + return { + 'memory_peak': int(memory_peak) / (1024*1024), + 'mem_current': int(memory_current) / (1024*1024) + } + + def get_disk_usage_bytes(): + db_stored_size = machine.succeed("du --bytes -s /var/lib/influxdb | cut -f1") + return int(db_stored_size) + + def print_memory_stats(stats): + for k in stats: + print(f"{k}: {round(stats[k])}MB") + + def get_and_print_memory_stats(service): + stats = get_memory_stats_mb(service) + print(f"{service} memory usage:") + print_memory_stats(stats) + return stats + + def check_stored_size(t): + stored_bytes = get_disk_usage_bytes() + stored_mb = round(stored_bytes / (1024*1024)) + print(f"Disk usage is: {stored_mb}MB") + + t.assertGreater(stored_bytes, 0, + "Stored DB size is zero?") + + t.assertLess(stored_bytes, MAX_INFLUXDB_STORED_SIZE_MB * 1024 * 1024, + f"Stored DB size exceeded {MAX_INFLUXDB_STORED_SIZE_MB}MB") + + + ## TESTS + + machine.start() + + with TestPrecondition("influxdb and telegraf are running"): + machine.wait_for_unit("influxdb.service", timeout=10) + machine.wait_for_unit("telegraf.service", timeout=10) + + + ## Stage 1: check if basic collection has no memory spikes + + sleep_duration = 1 if SPEEDRUN else 120 + print(f"Collecting Telegraf stats for {sleep_duration} seconds...") + time.sleep(sleep_duration) + + with TestCase("Memory usage of Telegraf is reasonable") as t: + telegraf_stats = get_and_print_memory_stats("telegraf.service") + t.assertLess(telegraf_stats['memory_peak'], MAX_TELEGRAF_PEAK_MEMORY_MB) + + print("Stopping Telegraf to force data flush.") + machine.systemctl("stop telegraf.service") + + with TestCase("Memory usage of InfluxDB is reasonable") as t: + influxdb_stats = get_and_print_memory_stats("influxdb.service") + t.assertLess(influxdb_stats['memory_peak'], MAX_INFLUXDB_PEAK_MEMORY_MB) + + ## Stage 2: run a stress test to backfill data for the whole retention + + with TestPrecondition("Cardinality of data collected by Telegraf matches stress test setup") as t: + check_cardinalities(t) + + with TestCase(f"Generate {weeks} weeks of data (SLOW_MODE = ${lib.boolToString slowMode})"): + res = machine.succeed( + f"""inch \ + -no-setup \ + -max-errors 10 \ + -db playos \ + -precision s \ + -randomize-fields \ + -m {measurements} \ + -f {fields} \ + -t {",".join(map(str,tag_cardinalities))} \ + -p {points} \ + -delay ${writeDelay} \ + -time -{weeks*7*24}h + """) + print(res) + + print("====== STATS IMMEDIATELLY AFTER ========") + get_and_print_memory_stats("influxdb.service") + # Note: memory limits enforced via MemoryMax, "realistic" memory only + # checked at the end + + with TestCase("Disk usage after stress test is within limits") as t: + check_stored_size(t) + + + sleep_duration = 1 if SPEEDRUN else 120 + print(f"Sleeping for {sleep_duration} seconds to allow compaction...") + time.sleep(sleep_duration) + + print("====== STATS AFTER compaction ========") + get_and_print_memory_stats("influxdb.service") + + with TestCase("Disk usage after compaction is within limits") as t: + check_stored_size(t) + + print("====== STATS AFTER restarting ========") + 
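+          # Restarting recreates the influxdb service cgroup, so the MemoryPeak
+          # checked below should reflect steady-state usage with the backfilled
+          # data loaded, not the peak reached during the stress phase itself.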
machine.systemctl("restart influxdb.service") + if not SPEEDRUN: + time.sleep(10) + stats = get_and_print_memory_stats("influxdb.service") + + with TestCase("InfluxDB memory usage after restart is within limits") as t: + t.assertLess(stats['memory_peak'], MAX_INFLUXDB_PEAK_MEMORY_MB) + + with TestCase("Disk usage after restart is within limits") as t: + check_stored_size(t) + ''; +} From 4f461fe4419be0064c916b621b90d55165012479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Fri, 19 Dec 2025 13:47:56 +0200 Subject: [PATCH 2/5] Remove persistentFolders from mount_points They all show the same numbers, since this is not based on `du`. --- base/monitoring.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/monitoring.nix b/base/monitoring.nix index 6790490c..9f8e8a72 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -299,7 +299,7 @@ in mount_points = [ "/" # tmpfs overlay config.playos.storage.persistentDataPartition.mountPath # /mnt/data - ] ++ (builtins.attrNames config.playos.storage.persistentFolders); # individual persistent folders + ]; fieldinclude = [ "free" From b930e8ec534984b7423be7af9fead2514aaf2768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Fri, 19 Dec 2025 13:53:58 +0200 Subject: [PATCH 3/5] Use a more general interface glob --- base/monitoring.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/monitoring.nix b/base/monitoring.nix index 9f8e8a72..f99bb36b 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -327,7 +327,7 @@ in interfaces = [ "wl*" "eth*" - "enp*" + "en*" ]; fieldinclude = [ "bytes_sent" From b67cf4a1414d77f0f5e7f943f880066b70173e53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Mon, 19 Jan 2026 12:36:56 +0200 Subject: [PATCH 4/5] Build a minimal version of telegraf This drastically reduces the binary size and memory usage, from ~160-180MB to 20-30MB. A simple approach with hard-coded plugin list instead of some mutually-recursive mess that would require a cyclic dependency for config -> telgraf-build -> config-validation. --- base/monitoring.nix | 10 +++++++- pkgs/default.nix | 2 ++ pkgs/telegraf.nix | 30 +++++++++++++++++++++++ testing/integration/monitoring-stress.nix | 2 +- 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 pkgs/telegraf.nix diff --git a/base/monitoring.nix b/base/monitoring.nix index f99bb36b..034cf42e 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -101,6 +101,12 @@ in echo "=== Telegraf ouput:" cat output.txt + echo "Hint: PlayOS uses a custom build of telegraf, so if you get" + echo "an error like 'undefined but requested input', this can mean" + echo "two things:" + echo " 1. Typo / wrong name of plugin" + echo " 2. Plugin is not included in custom build, check pkgs/telegraf.nix" + exit 1 fi ''; @@ -228,7 +234,7 @@ in systemd.services.telegraf.serviceConfig = commonServiceConfig // { EnvironmentFile = "/var/cache/telegraf/env-file"; - MemoryMax = "200M"; + MemoryMax = "60M"; }; systemd.services.telegraf.path = [ @@ -237,6 +243,8 @@ in telegrafConfigIsValid ]; + # NOTE: if you add new inputs/ouputs or other configuration options that + # require extra telegraf dependencies, you need to also modify pkgs/telegraf.nix services.telegraf.extraConfig = with builtins; rec { global_tags.playos_version = lib.mkIf (config.playos ? 
"version") config.playos.version; diff --git a/pkgs/default.nix b/pkgs/default.nix index 348cfef7..8c14efa3 100644 --- a/pkgs/default.nix +++ b/pkgs/default.nix @@ -36,6 +36,8 @@ let }); focus-shift = self.callPackage ./focus-shift.nix {}; + + telegraf = (import ./telegraf.nix) super; }; in diff --git a/pkgs/telegraf.nix b/pkgs/telegraf.nix new file mode 100644 index 00000000..8f44d03d --- /dev/null +++ b/pkgs/telegraf.nix @@ -0,0 +1,30 @@ +# Build a minimal version of telegraf with only the plugins we actually use. +# +# Note: this relies on Telegraf's config validation in base/monitoring to detect +# missing plugins. +# +# See https://github.com/influxdata/telegraf/blob/master/docs/CUSTOMIZATION.md +# for details. +super: +let + supportedPlugins = [ + "inputs.cpu" + "inputs.disk" + "inputs.mem" + "inputs.net" + "inputs.procstat" + "inputs.system" + "inputs.sensors" + "inputs.systemd_units" + "inputs.wireless" + + "outputs.influxdb" + + "processors.strings" + ]; +in +super.telegraf.overrideAttrs (old: { + tags = (old.tags or []) ++ ["custom"] ++ supportedPlugins; + + doCheck = false; # tests require non-custom build +}) diff --git a/testing/integration/monitoring-stress.nix b/testing/integration/monitoring-stress.nix index 7a970b96..742eefb4 100644 --- a/testing/integration/monitoring-stress.nix +++ b/testing/integration/monitoring-stress.nix @@ -147,7 +147,7 @@ pkgs.testers.runNixOSTest { # (compaction done, regular telegraf collection) MAX_INFLUXDB_PEAK_MEMORY_MB = 150 - MAX_TELEGRAF_PEAK_MEMORY_MB = 150 + MAX_TELEGRAF_PEAK_MEMORY_MB = 40 SPEEDRUN = json.loads("${lib.boolToString speedrun}") if SPEEDRUN: From ab73b81fdc03ef000b36d7db9aae7aa181441296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Mon, 19 Jan 2026 12:48:04 +0200 Subject: [PATCH 5/5] Spelling --- base/monitoring.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/monitoring.nix b/base/monitoring.nix index 034cf42e..db9dc0e4 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -98,7 +98,7 @@ in echo "=== Config validation FAILED, config was:" cat ${configFile} - echo "=== Telegraf ouput:" + echo "=== Telegraf output:" cat output.txt echo "Hint: PlayOS uses a custom build of telegraf, so if you get"