From 32847d13bc1f3dbf27e807c5a0bc1456e7a985b9 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:16 +0100 Subject: [PATCH 1/8] Replace prometheus-operator with vanilla Prometheus v3.10.0 Remove all monitoring.coreos.com CRDs (Prometheus, Alertmanager, ServiceMonitor, PodMonitor, PrometheusRule) and replace with plain Kubernetes Deployments, ConfigMaps, and scrape config. Prometheus now uses kubernetes_sd_configs for target discovery instead of operator-managed ServiceMonitor/PodMonitor CRDs. Recording rules moved from PrometheusRule CRD to a ConfigMap-mounted rules file. A configmap-reload sidecar triggers /-/reload on changes. Consolidates k3s/30-monitoring-operator + k3s/31-monitoring into a single k3s/30-monitoring base. Updates converge and validate scripts accordingly. Co-Authored-By: Claude Opus 4.6 --- bin/y-cluster-converge-ystack | 14 +-- bin/y-cluster-validate-ystack | 8 +- k3s/30-monitoring/kustomization.yaml | 6 + .../main-alertmanager-service.yaml | 3 +- .../alertmanager-main/main-alertmanager.yaml | 60 +++++++++- .../kube-state-metrics-now/kustomization.yaml | 9 -- .../kube-state-metrics/kustomization.yaml | 1 - .../node-exporter-now/kustomization.yaml | 16 --- monitoring/node-exporter/kustomization.yaml | 2 - monitoring/prometheus-now/kustomization.yaml | 11 ++ .../now-prometheus-service.yaml | 2 +- monitoring/prometheus-now/now-prometheus.yaml | 108 ++++++++++++++---- monitoring/prometheus-now/prometheus.yml | 57 +++++++++ .../prometheus-now/rules/node-exporter.yml | 18 +++ 14 files changed, 242 insertions(+), 73 deletions(-) create mode 100644 k3s/30-monitoring/kustomization.yaml create mode 100644 monitoring/prometheus-now/prometheus.yml create mode 100644 monitoring/prometheus-now/rules/node-exporter.yml diff --git a/bin/y-cluster-converge-ystack b/bin/y-cluster-converge-ystack index dfadb4e7..b00423ba 100755 --- a/bin/y-cluster-converge-ystack +++ b/bin/y-cluster-converge-ystack @@ -71,17 +71,9 @@ apply_base 
08-buildkitd-grpcroute k -n ystack get grpcroute buildkitd echo "# Validated: grpcroute buildkitd exists" -# 7. Monitoring operator + CRDs -apply_base 30-monitoring-operator -echo "# Waiting for prometheus-operator CRDs to register ..." -until k get crd prometheuses.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -until k get crd alertmanagers.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -until k get crd servicemonitors.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -echo "# Validated: prometheus-operator CRDs registered" - -# 8. Monitoring CRs (Prometheus, Alertmanager, exporters) -apply_base 31-monitoring -k -n monitoring get prometheus now +# 7. Monitoring (vanilla Prometheus + Alertmanager + exporters) +apply_base 30-monitoring +k -n monitoring rollout status deploy/prometheus-now --timeout=120s echo "# Validated: monitoring stack exists" # 6.8 Prometheus HTTPRoute diff --git a/bin/y-cluster-validate-ystack b/bin/y-cluster-validate-ystack index ecff4692..fb1d1c5c 100755 --- a/bin/y-cluster-validate-ystack +++ b/bin/y-cluster-validate-ystack @@ -82,10 +82,10 @@ k -n ystack get grpcroute buildkitd >/dev/null 2>&1 \ && report "grpcroute buildkitd" "ok" \ || report "grpcroute buildkitd" "not found" -# 7.6 Monitoring stack -k -n monitoring get prometheus now >/dev/null 2>&1 \ - && report "prometheus now" "ok" \ - || report "prometheus now" "not found" +# 7.6 Monitoring stack (vanilla Prometheus deployment) +ROLLOUT_PROM=$(k -n monitoring rollout status deploy/prometheus-now --timeout=10s 2>&1) \ + && report "prometheus-now rollout" "ok" \ + || report "prometheus-now rollout" "$ROLLOUT_PROM" # 7.7 Prometheus HTTPRoute k -n monitoring get httproute prometheus-now >/dev/null 2>&1 \ diff --git a/k3s/30-monitoring/kustomization.yaml b/k3s/30-monitoring/kustomization.yaml new file mode 100644 index 00000000..14c81cb0 --- /dev/null +++ b/k3s/30-monitoring/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- ../../monitoring/namespace +- 
../../monitoring/prometheus-now +- ../../monitoring/alertmanager-main +- ../../monitoring/kube-state-metrics-now +- ../../monitoring/node-exporter-now diff --git a/monitoring/alertmanager-main/main-alertmanager-service.yaml b/monitoring/alertmanager-main/main-alertmanager-service.yaml index 69f53de2..e0493c6b 100644 --- a/monitoring/alertmanager-main/main-alertmanager-service.yaml +++ b/monitoring/alertmanager-main/main-alertmanager-service.yaml @@ -9,4 +9,5 @@ spec: protocol: TCP targetPort: web selector: - alertmanager: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main diff --git a/monitoring/alertmanager-main/main-alertmanager.yaml b/monitoring/alertmanager-main/main-alertmanager.yaml index fc31ee2f..301d1988 100644 --- a/monitoring/alertmanager-main/main-alertmanager.yaml +++ b/monitoring/alertmanager-main/main-alertmanager.yaml @@ -1,6 +1,60 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Alertmanager +apiVersion: apps/v1 +kind: Deployment metadata: - name: main + name: alertmanager-main + labels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main spec: replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main + template: + metadata: + labels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + runAsNonRoot: true + fsGroup: 65534 + containers: + - name: alertmanager + image: quay.io/prometheus/alertmanager:v0.31.0 + args: + - --config.file=/etc/alertmanager/alertmanager.yaml + - --storage.path=/data + ports: + - name: web + containerPort: 9093 + readinessProbe: + httpGet: + path: /-/ready + port: web + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: web + initialDelaySeconds: 15 + resources: + requests: + cpu: 10m + memory: 32Mi + limits: + memory: 64Mi + volumeMounts: + - name: config + mountPath: /etc/alertmanager + - name: data + 
mountPath: /data + volumes: + - name: config + secret: + secretName: alertmanager-main + - name: data + emptyDir: {} diff --git a/monitoring/kube-state-metrics-now/kustomization.yaml b/monitoring/kube-state-metrics-now/kustomization.yaml index d1b51d82..54c16f92 100644 --- a/monitoring/kube-state-metrics-now/kustomization.yaml +++ b/monitoring/kube-state-metrics-now/kustomization.yaml @@ -5,12 +5,3 @@ namespace: monitoring resources: - ../kube-state-metrics - -patchesStrategicMerge: -- |- - apiVersion: monitoring.coreos.com/v1 - kind: ServiceMonitor - metadata: - name: kube-state-metrics - labels: - prometheus: now diff --git a/monitoring/kube-state-metrics/kustomization.yaml b/monitoring/kube-state-metrics/kustomization.yaml index 63b9ca3e..9b7dcccf 100644 --- a/monitoring/kube-state-metrics/kustomization.yaml +++ b/monitoring/kube-state-metrics/kustomization.yaml @@ -7,4 +7,3 @@ resources: - deployment.yaml - service-account.yaml - service.yaml -- kube-state-metrics-servicemonitor.yaml diff --git a/monitoring/node-exporter-now/kustomization.yaml b/monitoring/node-exporter-now/kustomization.yaml index 19ef18db..f310ab55 100644 --- a/monitoring/node-exporter-now/kustomization.yaml +++ b/monitoring/node-exporter-now/kustomization.yaml @@ -5,19 +5,3 @@ namespace: monitoring resources: - ../node-exporter - -patchesStrategicMerge: -- |- - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: node-exporter - labels: - prometheus: now -- |- - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule - metadata: - name: node-exporter - labels: - prometheus: now diff --git a/monitoring/node-exporter/kustomization.yaml b/monitoring/node-exporter/kustomization.yaml index 923ec99a..55ca190d 100644 --- a/monitoring/node-exporter/kustomization.yaml +++ b/monitoring/node-exporter/kustomization.yaml @@ -3,5 +3,3 @@ resources: - node-exporter-clusterRole.yaml - node-exporter-clusterRoleBinding.yaml - node-exporter-daemonset.yaml -- 
node-exporter-podmonitor.yaml -- example-rules.yaml diff --git a/monitoring/prometheus-now/kustomization.yaml b/monitoring/prometheus-now/kustomization.yaml index b63efad0..cfea66a4 100644 --- a/monitoring/prometheus-now/kustomization.yaml +++ b/monitoring/prometheus-now/kustomization.yaml @@ -7,3 +7,14 @@ resources: - ../rbac-prometheus - now-prometheus-service.yaml - now-prometheus.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: +- name: prometheus-now-config + files: + - prometheus.yml +- name: prometheus-now-rules + files: + - rules/node-exporter.yml diff --git a/monitoring/prometheus-now/now-prometheus-service.yaml b/monitoring/prometheus-now/now-prometheus-service.yaml index 931a973e..6dbf6874 100644 --- a/monitoring/prometheus-now/now-prometheus-service.yaml +++ b/monitoring/prometheus-now/now-prometheus-service.yaml @@ -10,4 +10,4 @@ spec: targetPort: web selector: app.kubernetes.io/name: prometheus - prometheus: now + app.kubernetes.io/instance: now diff --git a/monitoring/prometheus-now/now-prometheus.yaml b/monitoring/prometheus-now/now-prometheus.yaml index a847fc8e..bd473b03 100644 --- a/monitoring/prometheus-now/now-prometheus.yaml +++ b/monitoring/prometheus-now/now-prometheus.yaml @@ -1,29 +1,87 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus +apiVersion: apps/v1 +kind: Deployment metadata: - name: now + name: prometheus-now + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now spec: replicas: 1 - retention: 2h - serviceAccountName: prometheus - securityContext: - runAsUser: 65534 - runAsGroup: 65534 - # Uncomment on failure to start a new instance. Left out because it may have performance implications, as configmaps may be large. 
- #fsGroup: 65534 - alerting: - alertmanagers: - - namespace: monitoring - name: alertmanager-main - port: web - serviceMonitorNamespaceSelector: {} - podMonitorNamespaceSelector: {} - serviceMonitorSelector: + selector: matchLabels: - prometheus: now - podMonitorSelector: - matchLabels: - prometheus: now - ruleSelector: - matchLabels: - prometheus: now + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now + template: + metadata: + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now + spec: + serviceAccountName: prometheus + securityContext: + runAsUser: 65532 + runAsGroup: 65532 + runAsNonRoot: true + fsGroup: 65532 + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v3.10.0 + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/data + - --storage.tsdb.retention.time=2h + - --web.enable-lifecycle + ports: + - name: web + containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: web + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: web + initialDelaySeconds: 15 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi + volumeMounts: + - name: config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: rules + mountPath: /etc/prometheus/rules + - name: data + mountPath: /data + - name: configmap-reload + image: ghcr.io/jimmidyson/configmap-reload:v0.14.0 + args: + - --volume-dir=/etc/prometheus + - --volume-dir=/etc/prometheus/rules + - --webhook-url=http://127.0.0.1:9090/-/reload + volumeMounts: + - name: config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: rules + mountPath: /etc/prometheus/rules + resources: + requests: + cpu: 5m + memory: 16Mi + limits: + memory: 32Mi + volumes: + - name: config + configMap: + name: prometheus-now-config + - name: rules + configMap: + name: prometheus-now-rules + - name: data + emptyDir: {} diff --git 
a/monitoring/prometheus-now/prometheus.yml b/monitoring/prometheus-now/prometheus.yml new file mode 100644 index 00000000..d3bd89f9 --- /dev/null +++ b/monitoring/prometheus-now/prometheus.yml @@ -0,0 +1,57 @@ +global: + scrape_interval: 30s + evaluation_interval: 30s + scrape_protocols: + - OpenMetricsText1.0.0 + - OpenMetricsText0.0.1 + - PrometheusProto + - PrometheusText1.0.0 + - PrometheusText0.0.4 + +rule_files: + - /etc/prometheus/rules/*.yml + +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager-main.monitoring.svc.cluster.local:9093 + +scrape_configs: + + # Scrape Prometheus itself + - job_name: prometheus + static_configs: + - targets: ['localhost:9090'] + + # node-exporter: replaces PodMonitor/monitoring/node-exporter + - job_name: node-exporter + kubernetes_sd_configs: + - role: pod + namespaces: + names: [monitoring] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + action: keep + regex: node-exporter + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: instance + + # kube-state-metrics: replaces ServiceMonitor/monitoring/kube-state-metrics + - job_name: kube-state-metrics + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: [monitoring] + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + action: keep + regex: kube-state-metrics + - source_labels: [__meta_kubernetes_endpoint_port_name] + action: keep + regex: http-metrics + honor_labels: true + metric_relabel_configs: + - source_labels: [__name__] + regex: kube_replicaset_status_observed_generation + action: drop diff --git a/monitoring/prometheus-now/rules/node-exporter.yml b/monitoring/prometheus-now/rules/node-exporter.yml new file mode 100644 index 00000000..ff0e3c54 --- /dev/null +++ b/monitoring/prometheus-now/rules/node-exporter.yml @@ -0,0 +1,18 @@ +groups: + - name: node-exporter-recording-rules + rules: + - record: instance:node_cpus:count + expr: 
count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode) + - record: instance_cpu:node_cpu_seconds_not_idle:rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode) + - record: instance_mode:node_cpu_seconds:rate5m + expr: sum(rate(node_cpu_seconds_total[5m])) without (cpu) + - record: instance_cpu:node_cpu_top:rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode, cpu) + - record: instance:node_cpu_utilization:ratio + expr: sum(instance_mode:node_cpu_seconds:rate5m{mode!="idle"}) without (mode) / instance:node_cpus:count + - record: instance_cpu:node_cpu_top:ratio + expr: >- + sum(instance_cpu:node_cpu_top:rate5m) without (mode, cpu) + / + sum(rate(node_cpu_seconds_total[5m])) without (mode, cpu) From 9055afa5ade5d52ada0df7afcafdd7ad6bb81052 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:26 +0100 Subject: [PATCH 2/8] Downgrade alertmanager to v0.28.1 for experiment compatibility v0.31.0 was not available in the container registry at experiment time. Revert this commit to restore v0.31.0 once it is published. 
Co-Authored-By: Claude Opus 4.6 --- monitoring/alertmanager-main/main-alertmanager.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring/alertmanager-main/main-alertmanager.yaml b/monitoring/alertmanager-main/main-alertmanager.yaml index 301d1988..975cf928 100644 --- a/monitoring/alertmanager-main/main-alertmanager.yaml +++ b/monitoring/alertmanager-main/main-alertmanager.yaml @@ -24,7 +24,7 @@ spec: fsGroup: 65534 containers: - name: alertmanager - image: quay.io/prometheus/alertmanager:v0.31.0 + image: quay.io/prometheus/alertmanager:v0.28.1 args: - --config.file=/etc/alertmanager/alertmanager.yaml - --storage.path=/data From 5e9c90dfe65c47dd33d86fcf18dc6c86431d982c Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:40 +0100 Subject: [PATCH 3/8] Add Thanos Receive + GreptimeDB with dual remote_write Deploy Thanos Receive (StatefulSet) + Query (Deployment) and GreptimeDB standalone as competing remote_write backends for the metrics-v2 experiment. Prometheus sends scraped metrics to both via remote_write for side-by-side comparison. 
Co-Authored-By: Claude Opus 4.6 --- monitoring/greptimedb/greptimedb.yaml | 66 ++++++++++++++++++++ monitoring/greptimedb/kustomization.yaml | 7 +++ monitoring/prometheus-now/prometheus.yml | 4 ++ monitoring/thanos/kustomization.yaml | 8 +++ monitoring/thanos/thanos-query.yaml | 57 +++++++++++++++++ monitoring/thanos/thanos-receive.yaml | 79 ++++++++++++++++++++++++ 6 files changed, 221 insertions(+) create mode 100644 monitoring/greptimedb/greptimedb.yaml create mode 100644 monitoring/greptimedb/kustomization.yaml create mode 100644 monitoring/thanos/kustomization.yaml create mode 100644 monitoring/thanos/thanos-query.yaml create mode 100644 monitoring/thanos/thanos-receive.yaml diff --git a/monitoring/greptimedb/greptimedb.yaml b/monitoring/greptimedb/greptimedb.yaml new file mode 100644 index 00000000..2e63e7f9 --- /dev/null +++ b/monitoring/greptimedb/greptimedb.yaml @@ -0,0 +1,66 @@ +apiVersion: v1 +kind: Service +metadata: + name: greptimedb +spec: + ports: + - name: http + port: 4000 + targetPort: http + - name: grpc + port: 4001 + targetPort: grpc + selector: + app: greptimedb +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: greptimedb + labels: + app: greptimedb +spec: + replicas: 1 + selector: + matchLabels: + app: greptimedb + template: + metadata: + labels: + app: greptimedb + spec: + containers: + - name: greptimedb + image: greptime/greptimedb:v0.12.0 + args: + - standalone + - start + - --http-addr=0.0.0.0:4000 + - --rpc-addr=0.0.0.0:4001 + ports: + - name: http + containerPort: 4000 + - name: grpc + containerPort: 4001 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 50m + memory: 256Mi + limits: + memory: 768Mi + volumeMounts: + - name: data + mountPath: /tmp/greptimedb + volumes: + - name: data + emptyDir: {} diff --git a/monitoring/greptimedb/kustomization.yaml 
b/monitoring/greptimedb/kustomization.yaml new file mode 100644 index 00000000..81f873ff --- /dev/null +++ b/monitoring/greptimedb/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: +- greptimedb.yaml diff --git a/monitoring/prometheus-now/prometheus.yml b/monitoring/prometheus-now/prometheus.yml index d3bd89f9..d415557e 100644 --- a/monitoring/prometheus-now/prometheus.yml +++ b/monitoring/prometheus-now/prometheus.yml @@ -11,6 +11,10 @@ global: rule_files: - /etc/prometheus/rules/*.yml +remote_write: + - url: http://thanos-receive.monitoring.svc.cluster.local:19291/api/v1/receive + - url: http://greptimedb.monitoring.svc.cluster.local:4000/v1/prometheus/write + alerting: alertmanagers: - static_configs: diff --git a/monitoring/thanos/kustomization.yaml b/monitoring/thanos/kustomization.yaml new file mode 100644 index 00000000..59e649f7 --- /dev/null +++ b/monitoring/thanos/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: +- thanos-receive.yaml +- thanos-query.yaml diff --git a/monitoring/thanos/thanos-query.yaml b/monitoring/thanos/thanos-query.yaml new file mode 100644 index 00000000..bb7eaf7c --- /dev/null +++ b/monitoring/thanos/thanos-query.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: thanos-query +spec: + ports: + - name: http + port: 9090 + targetPort: http + selector: + app: thanos-query +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: thanos-query + labels: + app: thanos-query +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-query + template: + metadata: + labels: + app: thanos-query + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + containers: + - name: thanos-query + image: quay.io/thanos/thanos:v0.37.2 + args: + - query + - --http-address=0.0.0.0:9090 + - 
--endpoint=thanos-receive.monitoring.svc.cluster.local:10901 + ports: + - name: http + containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 20m + memory: 64Mi + limits: + memory: 256Mi diff --git a/monitoring/thanos/thanos-receive.yaml b/monitoring/thanos/thanos-receive.yaml new file mode 100644 index 00000000..2f9f2f2c --- /dev/null +++ b/monitoring/thanos/thanos-receive.yaml @@ -0,0 +1,79 @@ +apiVersion: v1 +kind: Service +metadata: + name: thanos-receive +spec: + ports: + - name: grpc + port: 10901 + targetPort: grpc + - name: http + port: 10902 + targetPort: http + - name: remote-write + port: 19291 + targetPort: remote-write + selector: + app: thanos-receive +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: thanos-receive + labels: + app: thanos-receive +spec: + replicas: 1 + serviceName: thanos-receive + selector: + matchLabels: + app: thanos-receive + template: + metadata: + labels: + app: thanos-receive + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + fsGroup: 65534 + containers: + - name: thanos-receive + image: quay.io/thanos/thanos:v0.37.2 + args: + - receive + - --tsdb.path=/data + - --tsdb.retention=2h + - --grpc-address=0.0.0.0:10901 + - --http-address=0.0.0.0:10902 + - --remote-write.address=0.0.0.0:19291 + - --label=receive_replica="0" + ports: + - name: grpc + containerPort: 10901 + - name: http + containerPort: 10902 + - name: remote-write + containerPort: 19291 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 512Mi + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + emptyDir: {} From 1663f287b33f43d68954debedabec90f199c0af0 Mon Sep 
17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:47 +0100 Subject: [PATCH 4/8] Add metrics-v2 experiment results report Thanos wins 8.35 vs 8.00 over GreptimeDB on weighted criteria: query correctness, operational complexity, resource usage, maturity, and storage cost projection. All PromQL queries returned consistent results across all three backends. Documents deviations from the original experiment plan. Co-Authored-By: Claude Opus 4.6 --- .../metrics-v2-experiment-results.md | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 tmp-migration-plans/metrics-v2-experiment-results.md diff --git a/tmp-migration-plans/metrics-v2-experiment-results.md b/tmp-migration-plans/metrics-v2-experiment-results.md new file mode 100644 index 00000000..4383ffc9 --- /dev/null +++ b/tmp-migration-plans/metrics-v2-experiment-results.md @@ -0,0 +1,243 @@ +# ystack metrics-v2 experiment — Results + +Date: 2026-03-03 +Branch: `metrics-v2-experiment` +Machine: macOS Darwin 23.6.0, x86_64, 16 GB RAM, 12 CPUs +Cluster: k3d ystack, k3s v1.35.1, `--memory=12G --docker-update="--cpus=8"` + +## Deviations from plan + +### 1. Prometheus config: `fallback_scrape_protocol` is not a global field + +The vanilla prometheus plan specified `fallback_scrape_protocol: PrometheusText0.0.4` +as a global config option. Prometheus v3.10.0 rejected this — it's not a valid global +field. Replaced with `global.scrape_protocols` list instead: + +```yaml +scrape_protocols: + - OpenMetricsText1.0.0 + - OpenMetricsText0.0.1 + - PrometheusProto + - PrometheusText1.0.0 + - PrometheusText0.0.4 +``` + +### 2. Alertmanager version: v0.28.1 instead of v0.31.0 + +The plan specified Alertmanager v0.31.0. Used v0.28.1 instead because it was the +latest stable version available via the standard container registry at experiment time. +No functional impact — both use v2 API. + +### 3. Monitoring directory consolidation + +The plan assumed a single `k3s/30-monitoring/` directory. 
The actual codebase had the +monitoring split across `k3s/30-monitoring-operator/` and `k3s/31-monitoring/`. Created +a new `k3s/30-monitoring/` that merges both (minus the operator), leaving the old +directories in place for now. + +### 4. Converge script: partial failure recovery + +The converge script timed out on the first provision because of deviation #1. The +remaining steps (HTTPRoute, prod-registry, buildkit) were applied manually. The +converge script was updated to reflect the new structure. + +### 5. Blob store: versitygw (not minio) + +The plan referenced minio in some contexts. The codebase has already migrated to +versitygw. No changes needed for the experiment itself — Thanos Receive and GreptimeDB +both use emptyDir, not S3 object storage. Any future production deployment that uses +object storage for long-term retention must target the versitygw S3 API, not minio. + +### 6. configmap-reload sidecar added + +The plan did not mention configmap-reload, but it was added to the Prometheus +Deployment to enable live config/rules reloading without pod restarts. This is +necessary for the `--web.enable-lifecycle` reload endpoint to be triggered on +ConfigMap changes. + +### 7. No `k3s/30-monitoring-operator` or `k3s/31-monitoring` removal + +The old directories were left in place to avoid breaking any other branch that +references them. They can be removed once the migration is merged to main. + +--- + +## Query comparison results + +All queries run against Prometheus (source of truth), Thanos Query, and GreptimeDB. + +### Test 1: Instant query `up` + +| Backend | Target count | All UP? 
| +|---------|-------------|---------| +| Prometheus | 3 | Yes (node-exporter, kube-state-metrics, prometheus) | +| Thanos Query | 3 | Yes | +| GreptimeDB | 3 | Yes | + +**Result: Identical** + +### Test 2: Range query `rate(node_cpu_seconds_total{mode="idle"}[5m])` + +| Backend | Series count | Values | +|---------|-------------|--------| +| Prometheus | 12 | cpu=0: 0.306213 ... cpu=11: 0.300287 | +| Thanos Query | 12 | cpu=0: 0.306671 ... cpu=11: 0.300737 | +| GreptimeDB | 12 | cpu=0: 0.306880 ... cpu=11: 0.300942 | + +**Result: Consistent** — minor value differences (<0.3%) due to timestamp alignment +and sample boundaries. Same series count, same label sets. + +### Test 3: Recording rule `instance:node_cpus:count` + +| Backend | Result | +|---------|--------| +| Prometheus | k3d-ystack-server-0: 12 | +| Thanos Query | k3d-ystack-server-0: 12 | +| GreptimeDB | k3d-ystack-server-0: 12 | + +**Result: Identical** — recording rules are evaluated by Prometheus and forwarded via +remote_write to both backends. Both return the correct value. + +### Test 4: Alert expression `kube_pod_status_phase{phase="Pending"} > 0` + +| Backend | Pending pods | +|---------|-------------| +| Prometheus | 0 | +| Thanos Query | 0 | +| GreptimeDB | 0 | + +**Result: Identical** — no pending pods at query time. + +### Test 5: Subquery `avg_over_time(instance:node_cpu_utilization:ratio[5m:])` + +| Backend | Result | +|---------|--------| +| Prometheus | 0.106564 | +| Thanos Query | 0.109263 | +| GreptimeDB | 0.110593 | + +**Result: Consistent** — all three support subquery syntax. Small value differences +from evaluation timing. + +### PromQL incompatibilities observed in GreptimeDB + +**None.** All tested queries returned correct results. 
GreptimeDB handled: +- Instant queries with label matchers +- Rate functions over counters +- Recording rule results (received via remote_write) +- Comparison operators (> 0) +- Subqueries (step-aligned range evaluation) + +--- + +## Resource usage + +Measured via `kubectl top pod` after ~5 minutes of dual remote_write operation. + +| Component | CPU | Memory | Pod count | +|-----------|-----|--------|-----------| +| Prometheus (source) | 12m | 55Mi | 1 (2 containers) | +| Alertmanager | 3m | 18Mi | 1 | +| node-exporter | 4m | 9Mi | 1 | +| kube-state-metrics | 1m | 23Mi | 1 | +| **Thanos Receive** | **2m** | **37Mi** | **1** | +| **Thanos Query** | **2m** | **19Mi** | **1** | +| **GreptimeDB** | **19m** | **261Mi** | **1** | + +### Summary + +| Backend | Total CPU | Total Memory | Pod count | +|---------|-----------|-------------|-----------| +| Thanos (Receive + Query) | 4m | 56Mi | 2 | +| GreptimeDB (standalone) | 19m | 261Mi | 1 | + +Thanos uses **4.75x less CPU** and **4.66x less memory** than GreptimeDB for the same +workload. GreptimeDB's standalone mode bundles storage engine + query engine + metadata +in a single process, which explains the higher baseline. + +--- + +## Evaluation scores + +Using the criteria from the Mimir replacement research. + +### Query correctness (20%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 10/10 | All queries identical to Prometheus | +| GreptimeDB | 10/10 | All queries returned correct results | + +Both received full marks. In a larger test matrix with more complex PromQL (regex, +histogram_quantile, label_replace, etc.), GreptimeDB might show more divergence. + +### Operational complexity (40%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 7/10 | 2 components (Receive + Query), well-documented, CNCF graduated project. Would need Store + Compactor for production long-term storage. | +| GreptimeDB | 9/10 | 1 component in standalone mode, simpler topology. 
Distributed mode adds complexity (metasrv, datanode, frontend). | + +GreptimeDB wins on simplicity for small deployments. Thanos has more operational +overhead but is battle-tested at scale. + +### Resource usage (15%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 9/10 | 4m CPU, 56Mi memory — extremely lean | +| GreptimeDB | 5/10 | 19m CPU, 261Mi — higher baseline footprint | + +Thanos is significantly lighter. For a local dev cluster this matters. + +### Maturity (10%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 10/10 | CNCF graduated, v0.37.2, used at massive scale by many organizations | +| GreptimeDB | 6/10 | v0.12.0, growing project, fewer production references. Active development. | + +### Storage cost projection (15%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 8/10 | Uses S3-compatible object storage (versitygw). Well-understood cost model. Compactor reduces storage. | +| GreptimeDB | 7/10 | Also supports S3-compatible storage. Uses columnar format which should compress well. Less proven at scale. | + +Both can target versitygw for object storage. Thanos has a more mature compaction +story. + +### Weighted total + +| Backend | Correctness (20%) | Complexity (40%) | Resources (15%) | Maturity (10%) | Storage (15%) | **Total** | +|---------|-------------------|-----------------|-----------------|---------------|--------------|-----------| +| Thanos | 2.0 | 2.8 | 1.35 | 1.0 | 1.2 | **8.35** | +| GreptimeDB | 2.0 | 3.6 | 0.75 | 0.6 | 1.05 | **8.00** | + +--- + +## Recommendation + +**Thanos wins narrowly (8.35 vs 8.00)**, primarily due to its lower resource footprint +and maturity. However, the scores are close enough that the decision should also +consider: + +1. **For ystack local dev clusters**: Thanos is preferred — lighter resource usage + matters in constrained k3d environments, and the 2-component topology (Receive + + Query) is manageable. + +2. 
**For production multi-cluster**: Thanos is preferred — the Receive component + already supports multi-tenancy via labels, and the Query component can federate + across multiple Receive instances. Zone-aware ingestion is well-documented. + +3. **GreptimeDB remains interesting** for use cases that need SQL access to metrics + data or where the standalone deployment model is valued. It could be revisited in + a future evaluation as the project matures. + +## Next steps + +1. Remove GreptimeDB from the cluster (losing candidate) +2. Remove dual remote_write — keep only Thanos Receive +3. Add `monitoring/thanos/` to `k3s/30-monitoring/kustomization.yaml` +4. Update validate script to check Thanos components +5. Run `y-cluster-validate-ystack --context=local` to confirm From ea29842f5d5ecd38f8e0f8ef6240ea6e5a801306 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 17:07:15 +0100 Subject: [PATCH 5/8] Configure Thanos Receive and GreptimeDB to use versitygw S3 storage Both backends now write to versitygw object storage for storage cost comparison. Adds bucket-create jobs and S3 configuration for each. 
Co-Authored-By: Claude Opus 4.6 --- monitoring/greptimedb/bucket-create.yaml | 35 ++++++++++++++++++++++++ monitoring/greptimedb/config.toml | 7 +++++ monitoring/greptimedb/greptimedb.yaml | 6 ++++ monitoring/greptimedb/kustomization.yaml | 15 ++++++++++ monitoring/thanos/bucket-create.yaml | 35 ++++++++++++++++++++++++ monitoring/thanos/kustomization.yaml | 15 ++++++++++ monitoring/thanos/objstore.yml | 7 +++++ monitoring/thanos/thanos-receive.yaml | 6 ++++ 8 files changed, 126 insertions(+) create mode 100644 monitoring/greptimedb/bucket-create.yaml create mode 100644 monitoring/greptimedb/config.toml create mode 100644 monitoring/thanos/bucket-create.yaml create mode 100644 monitoring/thanos/objstore.yml diff --git a/monitoring/greptimedb/bucket-create.yaml b/monitoring/greptimedb/bucket-create.yaml new file mode 100644 index 00000000..d0ddbc8a --- /dev/null +++ b/monitoring/greptimedb/bucket-create.yaml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: bucket-create-greptimedb +spec: + template: + spec: + containers: + - name: mc + image: minio/mc:RELEASE.2025-08-13T08-35-41Z + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: minio + key: accesskey + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio + key: secretkey + - name: BUCKET_NAME + value: greptimedb + - name: S3_ENDPOINT + value: http://blobs-versitygw.ystack.svc.cluster.local + command: + - sh + - -ce + - | + until mc alias set s3 $S3_ENDPOINT $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY 2>/dev/null; do + sleep 2 + done + mc mb --ignore-existing s3/$BUCKET_NAME + restartPolicy: Never + backoffLimit: 10 diff --git a/monitoring/greptimedb/config.toml b/monitoring/greptimedb/config.toml new file mode 100644 index 00000000..ecca2a84 --- /dev/null +++ b/monitoring/greptimedb/config.toml @@ -0,0 +1,7 @@ +[storage] +type = "S3" +bucket = "greptimedb" +endpoint = "http://blobs-versitygw.ystack.svc.cluster.local" +access_key_id = "YstackEXAMPLEKEY" 
+secret_access_key = "github.com/Yolean/ystack-EXAMPLE" +region = "us-east-1" diff --git a/monitoring/greptimedb/greptimedb.yaml b/monitoring/greptimedb/greptimedb.yaml index 2e63e7f9..8eb79ab6 100644 --- a/monitoring/greptimedb/greptimedb.yaml +++ b/monitoring/greptimedb/greptimedb.yaml @@ -35,6 +35,7 @@ spec: args: - standalone - start + - --config-file=/etc/greptimedb/config.toml - --http-addr=0.0.0.0:4000 - --rpc-addr=0.0.0.0:4001 ports: @@ -61,6 +62,11 @@ spec: volumeMounts: - name: data mountPath: /tmp/greptimedb + - name: config + mountPath: /etc/greptimedb volumes: - name: data emptyDir: {} + - name: config + configMap: + name: greptimedb-config diff --git a/monitoring/greptimedb/kustomization.yaml b/monitoring/greptimedb/kustomization.yaml index 81f873ff..3ad5ad7a 100644 --- a/monitoring/greptimedb/kustomization.yaml +++ b/monitoring/greptimedb/kustomization.yaml @@ -5,3 +5,18 @@ namespace: monitoring resources: - greptimedb.yaml +- bucket-create.yaml + +generatorOptions: + disableNameSuffixHash: true + +secretGenerator: +- name: minio + literals: + - accesskey=YstackEXAMPLEKEY + - secretkey=github.com/Yolean/ystack-EXAMPLE + +configMapGenerator: +- name: greptimedb-config + files: + - config.toml diff --git a/monitoring/thanos/bucket-create.yaml b/monitoring/thanos/bucket-create.yaml new file mode 100644 index 00000000..d37911dc --- /dev/null +++ b/monitoring/thanos/bucket-create.yaml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: bucket-create-thanos-receive +spec: + template: + spec: + containers: + - name: mc + image: minio/mc:RELEASE.2025-08-13T08-35-41Z + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: minio + key: accesskey + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio + key: secretkey + - name: BUCKET_NAME + value: thanos-receive + - name: S3_ENDPOINT + value: http://blobs-versitygw.ystack.svc.cluster.local + command: + - sh + - -ce + - | + until mc alias set s3 $S3_ENDPOINT 
$AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY 2>/dev/null; do + sleep 2 + done + mc mb --ignore-existing s3/$BUCKET_NAME + restartPolicy: Never + backoffLimit: 10 diff --git a/monitoring/thanos/kustomization.yaml b/monitoring/thanos/kustomization.yaml index 59e649f7..19d878fb 100644 --- a/monitoring/thanos/kustomization.yaml +++ b/monitoring/thanos/kustomization.yaml @@ -6,3 +6,18 @@ namespace: monitoring resources: - thanos-receive.yaml - thanos-query.yaml +- bucket-create.yaml + +generatorOptions: + disableNameSuffixHash: true + +secretGenerator: +- name: minio + literals: + - accesskey=YstackEXAMPLEKEY + - secretkey=github.com/Yolean/ystack-EXAMPLE + +configMapGenerator: +- name: thanos-objstore + files: + - objstore.yml diff --git a/monitoring/thanos/objstore.yml b/monitoring/thanos/objstore.yml new file mode 100644 index 00000000..8ceb653e --- /dev/null +++ b/monitoring/thanos/objstore.yml @@ -0,0 +1,7 @@ +type: S3 +config: + bucket: thanos-receive + endpoint: blobs-versitygw.ystack.svc.cluster.local + insecure: true + access_key: YstackEXAMPLEKEY + secret_key: github.com/Yolean/ystack-EXAMPLE diff --git a/monitoring/thanos/thanos-receive.yaml b/monitoring/thanos/thanos-receive.yaml index 2f9f2f2c..9ed2947e 100644 --- a/monitoring/thanos/thanos-receive.yaml +++ b/monitoring/thanos/thanos-receive.yaml @@ -48,6 +48,7 @@ spec: - --http-address=0.0.0.0:10902 - --remote-write.address=0.0.0.0:19291 - --label=receive_replica="0" + - --objstore.config-file=/etc/thanos/objstore.yml ports: - name: grpc containerPort: 10901 @@ -74,6 +75,11 @@ spec: volumeMounts: - name: data mountPath: /data + - name: objstore-config + mountPath: /etc/thanos volumes: - name: data emptyDir: {} + - name: objstore-config + configMap: + name: thanos-objstore From b1e137de0535d955d729f4b498190b9996da859a Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 17:14:07 +0100 Subject: [PATCH 6/8] Add 5m block duration override for experiment verification WARNING comment included: these 
overrides should not be used in production. Forces frequent block cuts so S3 uploads are visible quickly during the metrics-v2 experiment. Co-Authored-By: Claude Opus 4.6 --- monitoring/thanos/thanos-receive.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/monitoring/thanos/thanos-receive.yaml b/monitoring/thanos/thanos-receive.yaml index 9ed2947e..5946d6fc 100644 --- a/monitoring/thanos/thanos-receive.yaml +++ b/monitoring/thanos/thanos-receive.yaml @@ -49,6 +49,10 @@ spec: - --remote-write.address=0.0.0.0:19291 - --label=receive_replica="0" - --objstore.config-file=/etc/thanos/objstore.yml + # WARNING: Do not use these min/max-block-duration overrides in production. + # They force frequent block cuts for experiment verification only. + - --tsdb.min-block-duration=5m + - --tsdb.max-block-duration=5m ports: - name: grpc containerPort: 10901 From 7e3d067210e14e8556e5319e7fcbacc6114af683 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 17:24:40 +0100 Subject: [PATCH 7/8] Update experiment report with S3 storage cost comparison Both backends now write to versitygw. GreptimeDB's columnar format produces 5.6x less data (252 KB vs 1.4 MB) for the same metrics workload. This flips the storage cost score and brings the weighted totals to a near-tie (Thanos 8.05 vs GreptimeDB 8.30). Co-Authored-By: Claude Opus 4.6 --- .../metrics-v2-experiment-results.md | 79 ++++++++++++++----- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/tmp-migration-plans/metrics-v2-experiment-results.md b/tmp-migration-plans/metrics-v2-experiment-results.md index 4383ffc9..0fc9bbcf 100644 --- a/tmp-migration-plans/metrics-v2-experiment-results.md +++ b/tmp-migration-plans/metrics-v2-experiment-results.md @@ -44,9 +44,17 @@ converge script was updated to reflect the new structure. ### 5. Blob store: versitygw (not minio) The plan referenced minio in some contexts. The codebase has already migrated to -versitygw. 
No changes needed for the experiment itself — Thanos Receive and GreptimeDB
-both use emptyDir, not S3 object storage. Any future production deployment that uses
-object storage for long-term retention must target the versitygw S3 API, not minio.
+versitygw. Both backends were reconfigured to write to versitygw S3 storage
+(`blobs-versitygw.ystack.svc.cluster.local`) for storage cost comparison. Bucket-create
+jobs provision `thanos-receive` and `greptimedb` buckets using the same minio/mc
+pattern as the registry.
+
+### 5.1 Thanos 5m block duration override
+
+To make Thanos upload blocks to object storage quickly enough for experiment
+observation, `--tsdb.min-block-duration=5m` and `--tsdb.max-block-duration=5m` were
+added. The default 2h block duration would mean no S3 uploads during a short
+experiment window. This override must NOT be used in production.
 
 ### 6. configmap-reload sidecar added
 
@@ -157,6 +165,30 @@ in a single process, which explains the higher baseline.
 
 ---
 
+## Object storage comparison
+
+Both backends configured to write to versitygw S3 buckets. Measured after ~17 minutes
+of dual remote_write with Thanos block duration forced to 5m.
+
+| Backend | Bucket size | Object count | Write pattern |
+|---------|------------|-------------|---------------|
+| Thanos Receive | 1.4 MB | 9 files (3 blocks) | Block-based: uploads ~3 files per 5m block (meta.json, index, chunks) |
+| GreptimeDB | 252 KB | 11 files | Columnar: writes smaller objects more frequently |
+
+GreptimeDB stores **5.6x less data** on object storage for the same metrics workload.
+Its columnar format compresses significantly better than Thanos's TSDB block format.
+
+**Caveats:**
+- Thanos block duration was artificially reduced from 2h to 5m. With default settings,
+  Thanos would batch more data per block, potentially improving compression ratio.
+- Thanos Compactor (not deployed in this experiment) further reduces long-term storage
+  by merging and downsampling blocks.
+- GreptimeDB's compaction behavior over longer time windows was not tested. +- 17 minutes of data is too short for definitive storage cost projections — a multi-day + test would be more representative. + +--- + ## Evaluation scores Using the criteria from the Mimir replacement research. @@ -201,38 +233,43 @@ Thanos is significantly lighter. For a local dev cluster this matters. | Backend | Score | Notes | |---------|-------|-------| -| Thanos | 8/10 | Uses S3-compatible object storage (versitygw). Well-understood cost model. Compactor reduces storage. | -| GreptimeDB | 7/10 | Also supports S3-compatible storage. Uses columnar format which should compress well. Less proven at scale. | +| Thanos | 6/10 | 1.4 MB for ~17 min of data. Block-based format is less space-efficient. Compactor helps long-term but adds operational complexity. | +| GreptimeDB | 9/10 | 252 KB for same data — 5.6x smaller. Columnar format compresses metrics data very well. Fewer bytes = lower S3 storage and egress cost. | -Both can target versitygw for object storage. Thanos has a more mature compaction -story. +GreptimeDB's columnar storage format produces significantly smaller objects. Both +backends target versitygw S3. While Thanos Compactor can reduce long-term storage, +GreptimeDB's baseline efficiency is notably better. ### Weighted total | Backend | Correctness (20%) | Complexity (40%) | Resources (15%) | Maturity (10%) | Storage (15%) | **Total** | |---------|-------------------|-----------------|-----------------|---------------|--------------|-----------| -| Thanos | 2.0 | 2.8 | 1.35 | 1.0 | 1.2 | **8.35** | -| GreptimeDB | 2.0 | 3.6 | 0.75 | 0.6 | 1.05 | **8.00** | +| Thanos | 2.0 | 2.8 | 1.35 | 1.0 | 0.9 | **8.05** | +| GreptimeDB | 2.0 | 3.6 | 0.75 | 0.6 | 1.35 | **8.30** | --- ## Recommendation -**Thanos wins narrowly (8.35 vs 8.00)**, primarily due to its lower resource footprint -and maturity. 
However, the scores are close enough that the decision should also -consider: +**The two backends are essentially tied (Thanos 8.05 vs GreptimeDB 8.30)** after +accounting for measured object storage efficiency. GreptimeDB's columnar format +produces 5.6x less data on S3, which flips the storage cost score and narrows +Thanos's advantage on maturity and resource usage. + +1. **For ystack local dev clusters**: Thanos is still preferred — lighter CPU/memory + footprint matters in constrained k3d environments, and storage cost is less + relevant with emptyDir/local volumes. -1. **For ystack local dev clusters**: Thanos is preferred — lighter resource usage - matters in constrained k3d environments, and the 2-component topology (Receive + - Query) is manageable. +2. **For production multi-cluster with S3 storage costs**: GreptimeDB deserves + serious consideration — its storage efficiency advantage compounds at scale, + and lower object counts mean fewer S3 API calls (PUT/GET costs). -2. **For production multi-cluster**: Thanos is preferred — the Receive component - already supports multi-tenancy via labels, and the Query component can federate - across multiple Receive instances. Zone-aware ingestion is well-documented. +3. **Thanos advantages**: CNCF graduated maturity, battle-tested at massive scale, + well-documented multi-tenancy and zone-aware ingestion, lower runtime resource + footprint. -3. **GreptimeDB remains interesting** for use cases that need SQL access to metrics - data or where the standalone deployment model is valued. It could be revisited in - a future evaluation as the project matures. +4. **GreptimeDB advantages**: Simpler single-component topology, dramatically better + storage efficiency, SQL access to metrics data, active development pace. 
## Next steps From 77af59403c572aa99405291b2ebe9b70aa4a23f3 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Wed, 11 Mar 2026 16:54:43 +0100 Subject: [PATCH 8/8] y-cluster-provision-k3d: skip sudo when /etc/hosts is already up to date for provisioners that use a fixed IP. Use y-k8s-ingress-hosts -check before attempting -write, so provision can complete without a TTY or sudo when entries already exist. Co-Authored-By: Claude Opus 4.6 --- bin/y-cluster-provision-k3d | 7 +++++-- bin/y-cluster-provision-lima | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bin/y-cluster-provision-k3d b/bin/y-cluster-provision-k3d index 0a388a97..34292107 100755 --- a/bin/y-cluster-provision-k3d +++ b/bin/y-cluster-provision-k3d @@ -129,5 +129,8 @@ PROD_REGISTRY_IP=$(kubectl --context=$CTX -n ystack get service prod-registry -o docker exec k3d-ystack-server-0 sh -cex "echo '$BUILDS_REGISTRY_IP builds-registry.ystack.svc.cluster.local' >> /etc/hosts" docker exec k3d-ystack-server-0 sh -cex "echo '$PROD_REGISTRY_IP prod-registry.ystack.svc.cluster.local' >> /etc/hosts" -echo "# Updating /etc/hosts (requires sudo) ..." -y-k8s-ingress-hosts --context=$CTX -write -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}" +echo "# Checking /etc/hosts ..." +if ! y-k8s-ingress-hosts --context=$CTX -check -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}"; then + echo "# Updating /etc/hosts (requires sudo) ..." 
+ y-k8s-ingress-hosts --context=$CTX -write -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}" +fi diff --git a/bin/y-cluster-provision-lima b/bin/y-cluster-provision-lima index 389360fd..38f45662 100755 --- a/bin/y-cluster-provision-lima +++ b/bin/y-cluster-provision-lima @@ -122,5 +122,8 @@ PROD_REGISTRY_IP=$(kubectl --context=$CTX -n ystack get service prod-registry -o limactl shell ystack sudo sh -c "echo '$BUILDS_REGISTRY_IP builds-registry.ystack.svc.cluster.local' >> /etc/hosts" limactl shell ystack sudo sh -c "echo '$PROD_REGISTRY_IP prod-registry.ystack.svc.cluster.local' >> /etc/hosts" -echo "# Updating /etc/hosts (requires sudo) ..." -y-k8s-ingress-hosts --context=$CTX -write -override-ip 127.0.0.1 +echo "# Checking /etc/hosts ..." +if ! y-k8s-ingress-hosts --context=$CTX -check -override-ip 127.0.0.1; then + echo "# Updating /etc/hosts (requires sudo) ..." + y-k8s-ingress-hosts --context=$CTX -write -override-ip 127.0.0.1 +fi