From a51296073319fc7bdad2656d1b005c787a01b0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Fri, 28 Nov 2025 15:49:24 +0200 Subject: [PATCH 1/5] Add local monitoring using Telegraf and InfluxDB --- application.nix | 38 +++ base/default.nix | 1 + base/monitoring.nix | 390 ++++++++++++++++++++++ testing/integration/monitoring-basic.nix | 126 +++++++ testing/integration/monitoring-stress.nix | 321 ++++++++++++++++++ 5 files changed, 876 insertions(+) create mode 100644 base/monitoring.nix create mode 100644 testing/integration/monitoring-basic.nix create mode 100644 testing/integration/monitoring-stress.nix diff --git a/application.nix b/application.nix index 9511b0d6..0b0d0ccf 100644 --- a/application.nix +++ b/application.nix @@ -84,6 +84,44 @@ rec { group = "users"; }; + playos.monitoring.enable = true; + playos.monitoring.extraServices = [ "dividat-driver.service" ]; + + systemd.services.telegraf.path = with pkgs; [ procps ]; # pgrep for procstat + + # track the memory and cpu usage of processes started in the X11 session + # (kiosk, qtwebengine and anything else) + services.telegraf.extraConfig = { + inputs.procstat = [{ + properties = [ "cpu" "memory" ]; + + taginclude = [ "process_name" ]; # not unique! + fieldinclude = [ + "pid" # Note: PID is a field, not a tag, to avoid tag cardinality + # growth due to restarts. + "cpu_time_iowait" + "cpu_usage" + "memory_rss" + "memory_shared" + ]; + + filter = [{ + name = "session-procs"; + cgroups = [ "/sys/fs/cgroup/user.slice/user-*.slice/session-*.scope" ]; + users = [ "play" ]; + }]; + + }]; + + processors.strings = [{ + left = [{ + tag = "process_name"; + width = 64; # trim process_names to at most 64 chars to avoid very long tag names + }]; + }]; + + }; + # Limit virtual terminals that can be switched to # Virtual terminal 7 is the kiosk, 8 is the status screen playos.xserver.activeVirtualTerminals = [ 7 8 ]; diff --git a/base/default.nix b/base/default.nix index 00297378..769292bf 100644 --- a/base/default.nix +++ b/base/default.nix @@ -19,6 +19,7 @@ with lib; ./system-partition.nix ./volatile-root.nix ./compatibility + ./monitoring.nix ]; options = { diff --git a/base/monitoring.nix b/base/monitoring.nix new file mode 100644 index 00000000..6790490c --- /dev/null +++ b/base/monitoring.nix @@ -0,0 +1,390 @@ +{ + config, + pkgs, + lib, + ... +}: +let + cfg = config.playos.monitoring; + localDatabase = cfg.localDbName; + dbRetention = cfg.localRetention; +in +{ + imports = [ + ./volatile-root.nix + ]; + + options = { + playos.monitoring = with lib; { + enable = mkEnableOption "Enable system monitoring tools"; + + localDbName = mkOption { + default = "playos"; + type = types.str; + }; + + localRetention = mkOption { + default = "12w"; + example = "30d"; + description = '' + How long to keep data in the local DB. Specified as duration unit string. + See: + https://docs.influxdata.com/influxdb/v1/query_language/manage-database/#retention-policy-management + https://docs.influxdata.com/influxdb/v1/query_language/spec/#durations + ''; + type = types.str; + }; + + localDbShard = mkOption { + default = "1w"; + example = "3d;"; + description = "InfluxDB shard duration (size). Must be smaller than localRetention duration. 
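+          Retention is enforced by dropping whole shard groups, so with the
+          default 12w retention and 1w shards expired data is removed roughly
+          one week at a time.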
See localRetention for references"; + type = types.str; + }; + + collectionIntervalSeconds = mkOption { + default = 60; + description = "Interval at which to collect metrics (in seconds)"; + type = types.ints.between 1 (60 * 60); + }; + + extraServices = mkOption { + default = [ ]; + description = "List of extra systemd service names (globs) to monitor"; + type = types.listOf types.str; + }; + }; + }; + + config = + let + commonServiceConfig = { + # restart with delay and backoff + Restart = lib.mkForce "always"; + RestartMaxDelaySec = "10min"; + RestartSteps = 10; + + # stop restarting after 20 attemps + StartLimitIntervalSec = "infinity"; + StartLimitBurst = 20; + + # limit resource usage + CPUWeight = 100 / 10; # 10 times smaller than the default + IOWeight = 100 / 10; + }; + + # A slightly silly, but helpful way to validate Telegraf's config. + # Due to the nix->TOML transformation and Telegraf's weird spec + # it is very easy to accidentally produce a broken config. + telegrafConfigIsValid = + let + telegrafCfg = config.services.telegraf; + settingsFormat = pkgs.formats.toml { }; + configFile = settingsFormat.generate "config.toml" + (telegrafCfg.extraConfig // {agent.debug = true; }); + in + pkgs.runCommand + "validate-config" + { buildInputs = with pkgs; [ telegraf ]; } + '' + set -euo pipefail + + echo "=== Validating telegraf's config..." + + if telegraf --config ${configFile} --test &> output.txt; then + echo "=== Config seems good!" + touch $out + else + echo "=== Config validation FAILED, config was:" + cat ${configFile} + + echo "=== Telegraf ouput:" + cat output.txt + + exit 1 + fi + ''; + + in + lib.mkIf cfg.enable { + + ### InfluxDB --- local metric storage + + services.influxdb.enable = true; + + playos.storage.persistentFolders."${config.services.influxdb.dataDir}" = { + mode = "0700"; + user = config.services.influxdb.user; + group = config.users.users."${config.services.influxdb.user}".group; + }; + + # for maintenance ops + environment.systemPackages = [ pkgs.influxdb ]; + + systemd.services.influxdb.serviceConfig = commonServiceConfig // { + # for the socket file + RuntimeDirectory = "influxdb"; + # for db / data + StateDirectory = "influxdb"; + + MemoryMax = "500M"; + + # limit to two cores + Environment = "GOMAXPROCS=2"; + }; + + services.influxdb.dataDir = "/var/lib/influxdb"; # use the standard dir + + services.influxdb.extraConfig = { + reporting-disabled = true; + + http = { + enabled = true; + + bind-address = "localhost:8086"; + unix-socket-enabled = true; + bind-socket = "/var/run/influxdb/influxdb.sock"; + + auth-enabled = false; + log-enabled = false; + write-tracing = false; + pprof-enabled = false; + }; + + meta = { + retention-autocreate = false; + }; + + data = { + query-log-enabled = false; + + # avoid accidental cardinality explosions + max-series-per-database = 4000; + max-values-per-tag = 100; + + # reject writes if cache grows big + cache-max-memory-size = "200m"; + + # do one thing at a time + max-concurrent-compactions = 1; + }; + + logging.level = "warn"; + logging.suppress-logo = true; + + monitor.store-enabled = false; + subscriber.enabled = false; + continuous_queries.enabled = false; + admin.enabled = false; + hinted-handoff.enabled = false; + }; + + ### Telegraf --- metric collection + + services.telegraf.enable = true; + + systemd.tmpfiles.rules = [ + "f '/var/cache/telegraf/env-file' 0755 telegraf telegraf - -" + ]; + + # expose machine-id via an env file and setup the DB + systemd.services.telegraf-setup = { + + serviceConfig.ExecStart = 
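+        # One-shot setup script: exposes MACHINE_ID to Telegraf via an env file
+        # and creates the local database with its retention policy. With the
+        # default options (localDbName = "playos", localRetention = "12w",
+        # localDbShard = "1w") the influx call below expands to roughly:
+        #   CREATE DATABASE playos;
+        #   CREATE RETENTION POLICY "12w" ON playos
+        #     DURATION 12w REPLICATION 1 SHARD DURATION 1w DEFAULT;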
lib.getExe ( + pkgs.writeShellApplication { + name = "telegraf-setup"; + runtimeInputs = with pkgs; [ + influxdb + gnugrep + ]; + text = '' + echo "MACHINE_ID=$(cat /etc/machine-id)" > /var/cache/telegraf/env-file + + result_file=$(mktemp) + trap 'rm -f $result_file' EXIT + + influx -format csv -execute "SHOW DATABASES" > "$result_file" + + if grep -q ${localDatabase} "$result_file"; then + echo "Database '${localDatabase}' exists, nothing to do" + else + echo "Creating ${localDatabase}" + influx -execute 'CREATE DATABASE ${localDatabase}; CREATE RETENTION POLICY "${dbRetention}" ON ${localDatabase} DURATION ${dbRetention} REPLICATION 1 SHARD DURATION ${cfg.localDbShard} DEFAULT; ' + fi + ''; + } + ); + + serviceConfig.Type = "oneshot"; + serviceConfig.User = "telegraf"; + + requires = [ "influxdb.service" ]; + after = [ "influxdb.service" ]; + + before = [ "telegraf.service" ]; + requiredBy = [ "telegraf.service" ]; + }; + + systemd.services.telegraf.serviceConfig = commonServiceConfig // { + EnvironmentFile = "/var/cache/telegraf/env-file"; + + MemoryMax = "200M"; + }; + + systemd.services.telegraf.path = [ + pkgs.lm_sensors # for inputs.sensors + pkgs.dbus # for inputs.systemd_units + telegrafConfigIsValid + ]; + + services.telegraf.extraConfig = with builtins; rec { + global_tags.playos_version = lib.mkIf (config.playos ? "version") config.playos.version; + + agent = { + quiet = true; + hostname = "playos-\${MACHINE_ID}"; + + always_include_global_tags = true; + + interval = "${toString cfg.collectionIntervalSeconds}s"; + precision = "${toString (ceil (cfg.collectionIntervalSeconds / 2.0))}s"; + + # don't launch all collectors at once + collection_jitter = "${toString (ceil (cfg.collectionIntervalSeconds / 5.0))}s"; + + # avoid buffering many things to reduce mem usage + metric_batch_size = 50; + metric_buffer_limit = 100; + }; + + outputs.influxdb = { + urls = [ "unix:///var/run/influxdb/influxdb.sock" ]; + database = "${localDatabase}"; + content_encoding = "identity"; # don't compress + skip_database_creation = true; # we set up the DB manually + }; + + ## INPUTS: collected metrics + + inputs.mem = { + fieldinclude = [ + "cached" + "free" + "mapped" + "used" + "slab" + "shared" + "available" + ]; + }; + + inputs.cpu = { + fieldinclude = [ + "usage_user" + "usage_system" + "usage_active" # is this sum of above? 
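+          # Per the Telegraf inputs.cpu docs, usage_active = 100 - usage_idle,
+          # i.e. all non-idle time (iowait, irq, steal, nice included), so it
+          # is a superset of usage_user + usage_system, not their exact sum.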
+ ]; + }; + + inputs.system = { + fieldinclude = [ + "load*" + ]; + }; + + inputs.disk = { + # drop all the metadata tags except path + taginclude = [ "path" ]; + interval = "5m"; # collect every 5 minutes, we don't expect big fluctuations here + mount_points = [ + "/" # tmpfs overlay + config.playos.storage.persistentDataPartition.mountPath # /mnt/data + ] ++ (builtins.attrNames config.playos.storage.persistentFolders); # individual persistent folders + + fieldinclude = [ + "free" + "used" + "inodes_used" + ]; + }; + + # TODO: check if it works on a PlayOS PC + #inputs.sensors = { }; + + inputs.wireless = { + # keeping many fields for now, to help debug wireless issues + fieldinclude = [ + "status" + "level" + "noise" + "retry" # cumulative retry counts + "misc" # packets dropped for un-specified reason + "missed_beacon" # missed beacon packets + ]; + }; + + inputs.net = { + interfaces = [ + "wl*" + "eth*" + "enp*" + ]; + fieldinclude = [ + "bytes_sent" + "bytes_recv" + "err_in" + "err_out" + "drop_in" + "drop_out" + ]; + ignore_protocol_stats = true; + }; + + # memory usage by dfferent systemd units + # (the plugin, as of v1.36.4, does not return IO or CPU stats) + inputs.systemd_units = + let + generalStuff = { + # drop all the metadata tags except name + taginclude = [ "name" ]; + fieldinclude = [ "mem_current" ]; + + scope = "system"; + details = true; + }; + in + lib.lists.map (x: generalStuff // x) [ + # memory usage by system processes and per user + { + unittype = "slice"; + pattern = lib.strings.concatStringsSep " " [ + "system.slice" + "user-*.slice" + ]; + } + # memory usage by core services + { + unittype = "service"; + pattern = lib.strings.concatStringsSep " " ( + [ + "telegraf.service" + "influxdb.service" + "connman.service" + "playos-*" + ] + ++ cfg.extraServices + ); + } + ]; + + # Additional input plugins of possible interest: + # + # - inputs.procstat + processors.topk for TOP 5 processes by mem/cpu? + # computationally heavy, not sure how useful. + # - inputs.diskio - disk perf, at least read_time, write_time, io_time, io_await? + # - inputs.kernel - various core kernel stats, including mem pressure + + }; + + }; +} diff --git a/testing/integration/monitoring-basic.nix b/testing/integration/monitoring-basic.nix new file mode 100644 index 00000000..e1bbef1a --- /dev/null +++ b/testing/integration/monitoring-basic.nix @@ -0,0 +1,126 @@ +{ + # Enable remote access to InfluxDB (port 18086). Build using + # `nix-build --arg debug true -A driverInteractive` and then use + # influx/Chronograf/Grafana to connect to `http://localhost:18086/` + debug ? false, +}: +let + pkgs = import ../../pkgs { }; + + inherit (pkgs) lib; +in +pkgs.testers.runNixOSTest { + name = "monitoring"; + + nodes = { + machine = + { + config, + lib, + pkgs, + ... 
+ }: + { + imports = [ + ../../base/monitoring.nix + ]; + + config = { + virtualisation.forwardPorts = lib.optional debug { + from = "host"; + host.port = 18086; + guest.port = 8086; + }; + + networking.firewall.enable = lib.mkIf debug (lib.mkForce false); + + playos.monitoring.enable = true; + + # collect faster in here + playos.monitoring.collectionIntervalSeconds = 2; + + # modify retetion policy to check configuration works + playos.monitoring.localRetention = "6h"; # 1h is smallest possible + playos.monitoring.localDbShard = "2h"; # 1h is smallest possible + + environment.systemPackages = [ + pkgs.influxdb + ]; + }; + }; + }; + + extraPythonPackages = ps: [ + ps.colorama + ps.types-colorama + ]; + + testScript = + { nodes }: + let + monCfg = nodes.machine.playos.monitoring; + dbName = monCfg.localDbName; + in + '' + ${builtins.readFile ../helpers/nixos-test-script-helpers.py} + import csv + + ## CONSTANTS + + ## HELPERS + + def run_query(query, as_dict_reader=True): + res = machine.succeed( + f"influx -database ${dbName} -format csv -execute '{query}'" + ).strip().split('\n') + # there should be at least a header + assert len(res) > 1, f"Query '{query}' returned no data?" + if as_dict_reader: + return csv.DictReader(res) + else: + return res + + + ## TESTS + + machine.start() + + with TestCase("influxdb and telegraf are running"): + machine.wait_for_unit("influxdb.service", timeout=10) + machine.wait_for_unit("telegraf.service", timeout=10) + + with TestCase("Retention policy is setup") as t: + results = list(run_query("SHOW RETENTION POLICIES")) + t.assertEqual(len(results), 1, + f"More than one retention policy found: {results}") + + policy = results[0] + t.assertEqual(policy['name'], "${monCfg.localRetention}") + t.assertTrue(policy['duration'].startswith("${monCfg.localRetention}")) + t.assertTrue(policy['shardGroupDuration'].startswith("${monCfg.localDbShard}")) + t.assertEqual(policy['default'], "true") + + sleep_duration = ${toString monCfg.collectionIntervalSeconds} * 3 + print(f"Sleeping for {sleep_duration} seconds to collect some metrics") + time.sleep(sleep_duration) + print("Restarting telegraf to force flush") + machine.systemctl("restart telegraf.service") + + with TestCase("Metrics are received") as t: + results = list(run_query("SELECT * FROM mem LIMIT 2")) + t.assertGreater(len(results), 1, "Expected at least 2 rows") + first_result = results[0] + + t.assertIn("free", first_result) + t.assertIn("used", first_result) + + with TestCase("Metrics are tagged with machine-id") as t: + machineId = machine.succeed("cat /etc/machine-id").strip() + t.assertEqual(first_result['host'], f"playos-{machineId}") + + with TestCase("Unnecessary tags are dropped") as t: + cpu_row = list(run_query("SELECT * FROM cpu LIMIT 1"))[0] + t.assertNotIn("time_guest_nice", cpu_row) + t.assertNotIn("usage_guest_nice", cpu_row) + ''; +} diff --git a/testing/integration/monitoring-stress.nix b/testing/integration/monitoring-stress.nix new file mode 100644 index 00000000..7a970b96 --- /dev/null +++ b/testing/integration/monitoring-stress.nix @@ -0,0 +1,321 @@ +# Stress and "volume" testing of the monitoring setup. 
+# +# The test generates simulated data to backfill InfluxDB for the configured +# retention period and checks several invariants: + +# - No cardinality explosion: the default metric collection (via Telegraf) is +# ran to check that it produces series with cardinalities within expected +# limits (for this stress test and in general) +# - Telegraf memory usage is within limits +# - Disk usage: after backfilling, stored InfluxDB data is within expeted limits +# - Memory usage: after backfilling and after InfluxDB completes compaction, +# we check that it can reach a resting state with low memory usage. +# +# Note that the observed memory usage here is higher than what we expect in a +# production system, since: +# - Telegraf is collecting data at 1s intervals, 60x faster +# - Months of data for the whole retention period is generated in *minutes* + +{ + # Enabling slowMode runs this test in a less stressful way by setting a + # delay between batched writes. This makes the data generation last ~15 + # minutes (instead of ~1-2m). This is still 8000x faster than we will be + # producing data, but gives enough time for InfluxDB to do house-cleaning + # and shows a more realistic memory profile. + slowMode ? false, + + # for (test) development - run through with tiny data and sleeps to check if + # the test setup works + speedrun ? false, + + # Enable remote access to InfluxDB (port 18086). Build using + # `nix-build --arg debug true -A driverInteractive` and then use + # influx/Chronograf/Grafana to connect to `http://localhost:18086/` + debug ? false, +}: +let + pkgs = import ../../pkgs { }; + + inherit (pkgs) lib; + + # influxdb stress testing tool + inch_tool = pkgs.buildGoModule rec { + pname = "inch"; + name = "inch"; + + vendorHash = "sha256-upbcZCZEqgp8QlbA1qihLBmyHA0oA5PatN/ur6MkzqU="; + + src = ( + pkgs.fetchFromGitHub { + owner = "influxdata"; + repo = "inch"; + rev = "56a9750e91941d59a17ef2463d351513f378d9f4"; + sha256 = "sha256-UXg3+L4PMW8u5RLeDja0kYzxUnljhxVYe+p29XW4xoM="; + } + ); + }; + + # how much to pause between batches when generating simulated metric data, + # controls generation speed and load on InfluxDB + writeDelay = if slowMode then "300ms" else "0"; + + # cgroup/OOM limits for InfluxDB. Actual expected usage is smaller, this is + # just to avoid OOM due to stress, see assertions. + memoryMax = if slowMode then "500M" else "2G"; + + # How much data to generate (and how much is stored in InfluxDB) + localRetentionWeeks = + if speedrun then + 1 + else + # matches the default configuration + 12; +in +pkgs.testers.runNixOSTest { + name = "monitoring"; + + nodes = { + machine = + { + config, + lib, + pkgs, + ... 
+ }: + { + imports = [ + ../../base/monitoring.nix + ]; + + config = { + virtualisation.forwardPorts = lib.optional debug { + from = "host"; + host.port = 18086; + guest.port = 8086; + }; + + networking.firewall.enable = lib.mkIf debug (lib.mkForce false); + + virtualisation.memorySize = lib.mkForce 3000; + + playos.monitoring.enable = true; + + # collect faster in here + playos.monitoring.collectionIntervalSeconds = + if slowMode then + 10 # still 6x more frequent + else + 1; + playos.monitoring.localRetention = "${toString localRetentionWeeks}w"; + + # enable frequent compaction to observe results quickly + services.influxdb.extraConfig.data = { + cache-snapshot-write-cold-duration = "1m"; + compact-full-write-cold-duration = "1m"; + }; + + systemd.services.influxdb.serviceConfig = { + MemoryMax = lib.mkForce memoryMax; + }; + + environment.systemPackages = [ + pkgs.influxdb + inch_tool + ]; + }; + }; + }; + + extraPythonPackages = ps: [ + ps.colorama + ps.types-colorama + ]; + + testScript = + { nodes }: + '' + ${builtins.readFile ../helpers/nixos-test-script-helpers.py} + import math + import json + + ## CONSTANTS + + # How much space can the InfluxDB take on disk + MAX_INFLUXDB_STORED_SIZE_MB = 300 + + # How much memory we expect InfluxDB to use in a "steady" state + # (compaction done, regular telegraf collection) + MAX_INFLUXDB_PEAK_MEMORY_MB = 150 + + MAX_TELEGRAF_PEAK_MEMORY_MB = 150 + + SPEEDRUN = json.loads("${lib.boolToString speedrun}") + if SPEEDRUN: + print("===== RUNNING IN SPEEDRUN MODE =====") + + weeks = ${toString localRetentionWeeks} + measurements = 10 # 1 measurement = 1 configured input plugin + + # Empirically we expect to have <100 unique tag values over ALL of the + # measurements, so product of tag_cardinalities should be less than + # 100/measurements + tag_cardinalities = [2, 5] + + # 1 series = 1 unique tag combination within a measurement + num_series = measurements * math.prod(tag_cardinalities) + + # each series will have this number of data fields + fields = 5 + + points_per_minute = 1 + points = weeks*7*24*60*points_per_minute + + ## HELPERS + + def run_query(query): + res = machine.succeed( + f"influx -database playos -format csv -execute '{query}'" + ).strip().split('\n') + # there should be at least a header + assert len(res) > 1, f"Query '{query}' returned no data?" + return res + + + def check_cardinalities(t): + measurements_list = run_query("SHOW MEASUREMENTS") + total_measurements = len(measurements_list) - 1 # minus header + + series_list = run_query("SHOW SERIES") + total_series = len(series_list) - 1 + + fields_list = run_query("SHOW FIELD KEYS") + total_fields = len(fields_list) - 1 + + print(f""" + Telegraf data collection produced: + - {total_measurements} measurements + - {total_series} unique series (measurements x tag_combos) + - {total_fields} unique fields, for a... + - {round(total_fields/total_measurements, 1)} avg. 
fields per measurement + """) + + t.assertLess(total_series, num_series, + "Telegraf collected metrics exceed assumed max series count") + t.assertLess(total_fields, measurements * fields, + "Telegraf collected metrics produced more fields than assumed") + + + def get_memory_stats_mb(service): + memory_current = machine.succeed(f"systemctl show {service} -p MemoryCurrent --value") + memory_peak = machine.succeed(f"systemctl show {service} -p MemoryPeak --value") + return { + 'memory_peak': int(memory_peak) / (1024*1024), + 'mem_current': int(memory_current) / (1024*1024) + } + + def get_disk_usage_bytes(): + db_stored_size = machine.succeed("du --bytes -s /var/lib/influxdb | cut -f1") + return int(db_stored_size) + + def print_memory_stats(stats): + for k in stats: + print(f"{k}: {round(stats[k])}MB") + + def get_and_print_memory_stats(service): + stats = get_memory_stats_mb(service) + print(f"{service} memory usage:") + print_memory_stats(stats) + return stats + + def check_stored_size(t): + stored_bytes = get_disk_usage_bytes() + stored_mb = round(stored_bytes / (1024*1024)) + print(f"Disk usage is: {stored_mb}MB") + + t.assertGreater(stored_bytes, 0, + "Stored DB size is zero?") + + t.assertLess(stored_bytes, MAX_INFLUXDB_STORED_SIZE_MB * 1024 * 1024, + f"Stored DB size exceeded {MAX_INFLUXDB_STORED_SIZE_MB}MB") + + + ## TESTS + + machine.start() + + with TestPrecondition("influxdb and telegraf are running"): + machine.wait_for_unit("influxdb.service", timeout=10) + machine.wait_for_unit("telegraf.service", timeout=10) + + + ## Stage 1: check if basic collection has no memory spikes + + sleep_duration = 1 if SPEEDRUN else 120 + print(f"Collecting Telegraf stats for {sleep_duration} seconds...") + time.sleep(sleep_duration) + + with TestCase("Memory usage of Telegraf is reasonable") as t: + telegraf_stats = get_and_print_memory_stats("telegraf.service") + t.assertLess(telegraf_stats['memory_peak'], MAX_TELEGRAF_PEAK_MEMORY_MB) + + print("Stopping Telegraf to force data flush.") + machine.systemctl("stop telegraf.service") + + with TestCase("Memory usage of InfluxDB is reasonable") as t: + influxdb_stats = get_and_print_memory_stats("influxdb.service") + t.assertLess(influxdb_stats['memory_peak'], MAX_INFLUXDB_PEAK_MEMORY_MB) + + ## Stage 2: run a stress test to backfill data for the whole retention + + with TestPrecondition("Cardinality of data collected by Telegraf matches stress test setup") as t: + check_cardinalities(t) + + with TestCase(f"Generate {weeks} weeks of data (SLOW_MODE = ${lib.boolToString slowMode})"): + res = machine.succeed( + f"""inch \ + -no-setup \ + -max-errors 10 \ + -db playos \ + -precision s \ + -randomize-fields \ + -m {measurements} \ + -f {fields} \ + -t {",".join(map(str,tag_cardinalities))} \ + -p {points} \ + -delay ${writeDelay} \ + -time -{weeks*7*24}h + """) + print(res) + + print("====== STATS IMMEDIATELLY AFTER ========") + get_and_print_memory_stats("influxdb.service") + # Note: memory limits enforced via MemoryMax, "realistic" memory only + # checked at the end + + with TestCase("Disk usage after stress test is within limits") as t: + check_stored_size(t) + + + sleep_duration = 1 if SPEEDRUN else 120 + print(f"Sleeping for {sleep_duration} seconds to allow compaction...") + time.sleep(sleep_duration) + + print("====== STATS AFTER compaction ========") + get_and_print_memory_stats("influxdb.service") + + with TestCase("Disk usage after compaction is within limits") as t: + check_stored_size(t) + + print("====== STATS AFTER restarting ========") + 
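+          # Restarting recreates the influxdb service cgroup, so the MemoryPeak
+          # checked below should reflect steady-state usage with the backfilled
+          # data loaded, not the peak reached during the stress phase itself.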
machine.systemctl("restart influxdb.service") + if not SPEEDRUN: + time.sleep(10) + stats = get_and_print_memory_stats("influxdb.service") + + with TestCase("InfluxDB memory usage after restart is within limits") as t: + t.assertLess(stats['memory_peak'], MAX_INFLUXDB_PEAK_MEMORY_MB) + + with TestCase("Disk usage after restart is within limits") as t: + check_stored_size(t) + ''; +} From 4f461fe4419be0064c916b621b90d55165012479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Fri, 19 Dec 2025 13:47:56 +0200 Subject: [PATCH 2/5] Remove persistentFolders from mount_points They all show the same numbers, since this is not based on `du`. --- base/monitoring.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/monitoring.nix b/base/monitoring.nix index 6790490c..9f8e8a72 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -299,7 +299,7 @@ in mount_points = [ "/" # tmpfs overlay config.playos.storage.persistentDataPartition.mountPath # /mnt/data - ] ++ (builtins.attrNames config.playos.storage.persistentFolders); # individual persistent folders + ]; fieldinclude = [ "free" From b930e8ec534984b7423be7af9fead2514aaf2768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Fri, 19 Dec 2025 13:53:58 +0200 Subject: [PATCH 3/5] Use a more general interface glob --- base/monitoring.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/monitoring.nix b/base/monitoring.nix index 9f8e8a72..f99bb36b 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -327,7 +327,7 @@ in interfaces = [ "wl*" "eth*" - "enp*" + "en*" ]; fieldinclude = [ "bytes_sent" From b67cf4a1414d77f0f5e7f943f880066b70173e53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Mon, 19 Jan 2026 12:36:56 +0200 Subject: [PATCH 4/5] Build a minimal version of telegraf This drastically reduces the binary size and memory usage, from ~160-180MB to 20-30MB. A simple approach with hard-coded plugin list instead of some mutually-recursive mess that would require a cyclic dependency for config -> telgraf-build -> config-validation. --- base/monitoring.nix | 10 +++++++- pkgs/default.nix | 2 ++ pkgs/telegraf.nix | 30 +++++++++++++++++++++++ testing/integration/monitoring-stress.nix | 2 +- 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 pkgs/telegraf.nix diff --git a/base/monitoring.nix b/base/monitoring.nix index f99bb36b..034cf42e 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -101,6 +101,12 @@ in echo "=== Telegraf ouput:" cat output.txt + echo "Hint: PlayOS uses a custom build of telegraf, so if you get" + echo "an error like 'undefined but requested input', this can mean" + echo "two things:" + echo " 1. Typo / wrong name of plugin" + echo " 2. Plugin is not included in custom build, check pkgs/telegraf.nix" + exit 1 fi ''; @@ -228,7 +234,7 @@ in systemd.services.telegraf.serviceConfig = commonServiceConfig // { EnvironmentFile = "/var/cache/telegraf/env-file"; - MemoryMax = "200M"; + MemoryMax = "60M"; }; systemd.services.telegraf.path = [ @@ -237,6 +243,8 @@ in telegrafConfigIsValid ]; + # NOTE: if you add new inputs/ouputs or other configuration options that + # require extra telegraf dependencies, you need to also modify pkgs/telegraf.nix services.telegraf.extraConfig = with builtins; rec { global_tags.playos_version = lib.mkIf (config.playos ? 
"version") config.playos.version; diff --git a/pkgs/default.nix b/pkgs/default.nix index 348cfef7..8c14efa3 100644 --- a/pkgs/default.nix +++ b/pkgs/default.nix @@ -36,6 +36,8 @@ let }); focus-shift = self.callPackage ./focus-shift.nix {}; + + telegraf = (import ./telegraf.nix) super; }; in diff --git a/pkgs/telegraf.nix b/pkgs/telegraf.nix new file mode 100644 index 00000000..8f44d03d --- /dev/null +++ b/pkgs/telegraf.nix @@ -0,0 +1,30 @@ +# Build a minimal version of telegraf with only the plugins we actually use. +# +# Note: this relies on Telegraf's config validation in base/monitoring to detect +# missing plugins. +# +# See https://github.com/influxdata/telegraf/blob/master/docs/CUSTOMIZATION.md +# for details. +super: +let + supportedPlugins = [ + "inputs.cpu" + "inputs.disk" + "inputs.mem" + "inputs.net" + "inputs.procstat" + "inputs.system" + "inputs.sensors" + "inputs.systemd_units" + "inputs.wireless" + + "outputs.influxdb" + + "processors.strings" + ]; +in +super.telegraf.overrideAttrs (old: { + tags = (old.tags or []) ++ ["custom"] ++ supportedPlugins; + + doCheck = false; # tests require non-custom build +}) diff --git a/testing/integration/monitoring-stress.nix b/testing/integration/monitoring-stress.nix index 7a970b96..742eefb4 100644 --- a/testing/integration/monitoring-stress.nix +++ b/testing/integration/monitoring-stress.nix @@ -147,7 +147,7 @@ pkgs.testers.runNixOSTest { # (compaction done, regular telegraf collection) MAX_INFLUXDB_PEAK_MEMORY_MB = 150 - MAX_TELEGRAF_PEAK_MEMORY_MB = 150 + MAX_TELEGRAF_PEAK_MEMORY_MB = 40 SPEEDRUN = json.loads("${lib.boolToString speedrun}") if SPEEDRUN: From ab73b81fdc03ef000b36d7db9aae7aa181441296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ignas=20Vy=C5=A1niauskas?= Date: Mon, 19 Jan 2026 12:48:04 +0200 Subject: [PATCH 5/5] Spelling --- base/monitoring.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/monitoring.nix b/base/monitoring.nix index 034cf42e..db9dc0e4 100644 --- a/base/monitoring.nix +++ b/base/monitoring.nix @@ -98,7 +98,7 @@ in echo "=== Config validation FAILED, config was:" cat ${configFile} - echo "=== Telegraf ouput:" + echo "=== Telegraf output:" cat output.txt echo "Hint: PlayOS uses a custom build of telegraf, so if you get"