From 32847d13bc1f3dbf27e807c5a0bc1456e7a985b9 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:16 +0100 Subject: [PATCH 1/8] Replace prometheus-operator with vanilla Prometheus v3.10.0 Remove all monitoring.coreos.com CRDs (Prometheus, Alertmanager, ServiceMonitor, PodMonitor, PrometheusRule) and replace with plain Kubernetes Deployments, ConfigMaps, and scrape config. Prometheus now uses kubernetes_sd_configs for target discovery instead of operator-managed ServiceMonitor/PodMonitor CRDs. Recording rules moved from PrometheusRule CRD to a ConfigMap-mounted rules file. A configmap-reload sidecar triggers /-/reload on changes. Consolidates k3s/30-monitoring-operator + k3s/31-monitoring into a single k3s/30-monitoring base. Updates converge and validate scripts accordingly. Co-Authored-By: Claude Opus 4.6 --- bin/y-cluster-converge-ystack | 14 +-- bin/y-cluster-validate-ystack | 8 +- k3s/30-monitoring/kustomization.yaml | 6 + .../main-alertmanager-service.yaml | 3 +- .../alertmanager-main/main-alertmanager.yaml | 60 +++++++++- .../kube-state-metrics-now/kustomization.yaml | 9 -- .../kube-state-metrics/kustomization.yaml | 1 - .../node-exporter-now/kustomization.yaml | 16 --- monitoring/node-exporter/kustomization.yaml | 2 - monitoring/prometheus-now/kustomization.yaml | 11 ++ .../now-prometheus-service.yaml | 2 +- monitoring/prometheus-now/now-prometheus.yaml | 108 ++++++++++++++---- monitoring/prometheus-now/prometheus.yml | 57 +++++++++ .../prometheus-now/rules/node-exporter.yml | 18 +++ 14 files changed, 242 insertions(+), 73 deletions(-) create mode 100644 k3s/30-monitoring/kustomization.yaml create mode 100644 monitoring/prometheus-now/prometheus.yml create mode 100644 monitoring/prometheus-now/rules/node-exporter.yml diff --git a/bin/y-cluster-converge-ystack b/bin/y-cluster-converge-ystack index dfadb4e7..b00423ba 100755 --- a/bin/y-cluster-converge-ystack +++ b/bin/y-cluster-converge-ystack @@ -71,17 +71,9 @@ apply_base 
08-buildkitd-grpcroute k -n ystack get grpcroute buildkitd echo "# Validated: grpcroute buildkitd exists" -# 7. Monitoring operator + CRDs -apply_base 30-monitoring-operator -echo "# Waiting for prometheus-operator CRDs to register ..." -until k get crd prometheuses.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -until k get crd alertmanagers.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -until k get crd servicemonitors.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -echo "# Validated: prometheus-operator CRDs registered" - -# 8. Monitoring CRs (Prometheus, Alertmanager, exporters) -apply_base 31-monitoring -k -n monitoring get prometheus now +# 7. Monitoring (vanilla Prometheus + Alertmanager + exporters) +apply_base 30-monitoring +k -n monitoring rollout status deploy/prometheus-now --timeout=120s echo "# Validated: monitoring stack exists" # 6.8 Prometheus HTTPRoute diff --git a/bin/y-cluster-validate-ystack b/bin/y-cluster-validate-ystack index ecff4692..fb1d1c5c 100755 --- a/bin/y-cluster-validate-ystack +++ b/bin/y-cluster-validate-ystack @@ -82,10 +82,10 @@ k -n ystack get grpcroute buildkitd >/dev/null 2>&1 \ && report "grpcroute buildkitd" "ok" \ || report "grpcroute buildkitd" "not found" -# 7.6 Monitoring stack -k -n monitoring get prometheus now >/dev/null 2>&1 \ - && report "prometheus now" "ok" \ - || report "prometheus now" "not found" +# 7.6 Monitoring stack (vanilla Prometheus deployment) +ROLLOUT_PROM=$(k -n monitoring rollout status deploy/prometheus-now --timeout=10s 2>&1) \ + && report "prometheus-now rollout" "ok" \ + || report "prometheus-now rollout" "$ROLLOUT_PROM" # 7.7 Prometheus HTTPRoute k -n monitoring get httproute prometheus-now >/dev/null 2>&1 \ diff --git a/k3s/30-monitoring/kustomization.yaml b/k3s/30-monitoring/kustomization.yaml new file mode 100644 index 00000000..14c81cb0 --- /dev/null +++ b/k3s/30-monitoring/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- ../../monitoring/namespace +- 
../../monitoring/prometheus-now +- ../../monitoring/alertmanager-main +- ../../monitoring/kube-state-metrics-now +- ../../monitoring/node-exporter-now diff --git a/monitoring/alertmanager-main/main-alertmanager-service.yaml b/monitoring/alertmanager-main/main-alertmanager-service.yaml index 69f53de2..e0493c6b 100644 --- a/monitoring/alertmanager-main/main-alertmanager-service.yaml +++ b/monitoring/alertmanager-main/main-alertmanager-service.yaml @@ -9,4 +9,5 @@ spec: protocol: TCP targetPort: web selector: - alertmanager: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main diff --git a/monitoring/alertmanager-main/main-alertmanager.yaml b/monitoring/alertmanager-main/main-alertmanager.yaml index fc31ee2f..301d1988 100644 --- a/monitoring/alertmanager-main/main-alertmanager.yaml +++ b/monitoring/alertmanager-main/main-alertmanager.yaml @@ -1,6 +1,60 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Alertmanager +apiVersion: apps/v1 +kind: Deployment metadata: - name: main + name: alertmanager-main + labels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main spec: replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main + template: + metadata: + labels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + runAsNonRoot: true + fsGroup: 65534 + containers: + - name: alertmanager + image: quay.io/prometheus/alertmanager:v0.31.0 + args: + - --config.file=/etc/alertmanager/alertmanager.yaml + - --storage.path=/data + ports: + - name: web + containerPort: 9093 + readinessProbe: + httpGet: + path: /-/ready + port: web + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: web + initialDelaySeconds: 15 + resources: + requests: + cpu: 10m + memory: 32Mi + limits: + memory: 64Mi + volumeMounts: + - name: config + mountPath: /etc/alertmanager + - name: data + 
mountPath: /data + volumes: + - name: config + secret: + secretName: alertmanager-main + - name: data + emptyDir: {} diff --git a/monitoring/kube-state-metrics-now/kustomization.yaml b/monitoring/kube-state-metrics-now/kustomization.yaml index d1b51d82..54c16f92 100644 --- a/monitoring/kube-state-metrics-now/kustomization.yaml +++ b/monitoring/kube-state-metrics-now/kustomization.yaml @@ -5,12 +5,3 @@ namespace: monitoring resources: - ../kube-state-metrics - -patchesStrategicMerge: -- |- - apiVersion: monitoring.coreos.com/v1 - kind: ServiceMonitor - metadata: - name: kube-state-metrics - labels: - prometheus: now diff --git a/monitoring/kube-state-metrics/kustomization.yaml b/monitoring/kube-state-metrics/kustomization.yaml index 63b9ca3e..9b7dcccf 100644 --- a/monitoring/kube-state-metrics/kustomization.yaml +++ b/monitoring/kube-state-metrics/kustomization.yaml @@ -7,4 +7,3 @@ resources: - deployment.yaml - service-account.yaml - service.yaml -- kube-state-metrics-servicemonitor.yaml diff --git a/monitoring/node-exporter-now/kustomization.yaml b/monitoring/node-exporter-now/kustomization.yaml index 19ef18db..f310ab55 100644 --- a/monitoring/node-exporter-now/kustomization.yaml +++ b/monitoring/node-exporter-now/kustomization.yaml @@ -5,19 +5,3 @@ namespace: monitoring resources: - ../node-exporter - -patchesStrategicMerge: -- |- - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: node-exporter - labels: - prometheus: now -- |- - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule - metadata: - name: node-exporter - labels: - prometheus: now diff --git a/monitoring/node-exporter/kustomization.yaml b/monitoring/node-exporter/kustomization.yaml index 923ec99a..55ca190d 100644 --- a/monitoring/node-exporter/kustomization.yaml +++ b/monitoring/node-exporter/kustomization.yaml @@ -3,5 +3,3 @@ resources: - node-exporter-clusterRole.yaml - node-exporter-clusterRoleBinding.yaml - node-exporter-daemonset.yaml -- 
node-exporter-podmonitor.yaml -- example-rules.yaml diff --git a/monitoring/prometheus-now/kustomization.yaml b/monitoring/prometheus-now/kustomization.yaml index b63efad0..cfea66a4 100644 --- a/monitoring/prometheus-now/kustomization.yaml +++ b/monitoring/prometheus-now/kustomization.yaml @@ -7,3 +7,14 @@ resources: - ../rbac-prometheus - now-prometheus-service.yaml - now-prometheus.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: +- name: prometheus-now-config + files: + - prometheus.yml +- name: prometheus-now-rules + files: + - rules/node-exporter.yml diff --git a/monitoring/prometheus-now/now-prometheus-service.yaml b/monitoring/prometheus-now/now-prometheus-service.yaml index 931a973e..6dbf6874 100644 --- a/monitoring/prometheus-now/now-prometheus-service.yaml +++ b/monitoring/prometheus-now/now-prometheus-service.yaml @@ -10,4 +10,4 @@ spec: targetPort: web selector: app.kubernetes.io/name: prometheus - prometheus: now + app.kubernetes.io/instance: now diff --git a/monitoring/prometheus-now/now-prometheus.yaml b/monitoring/prometheus-now/now-prometheus.yaml index a847fc8e..bd473b03 100644 --- a/monitoring/prometheus-now/now-prometheus.yaml +++ b/monitoring/prometheus-now/now-prometheus.yaml @@ -1,29 +1,87 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus +apiVersion: apps/v1 +kind: Deployment metadata: - name: now + name: prometheus-now + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now spec: replicas: 1 - retention: 2h - serviceAccountName: prometheus - securityContext: - runAsUser: 65534 - runAsGroup: 65534 - # Uncomment on failure to start a new instance. Left out because it may have performance implications, as configmaps may be large. 
- #fsGroup: 65534 - alerting: - alertmanagers: - - namespace: monitoring - name: alertmanager-main - port: web - serviceMonitorNamespaceSelector: {} - podMonitorNamespaceSelector: {} - serviceMonitorSelector: + selector: matchLabels: - prometheus: now - podMonitorSelector: - matchLabels: - prometheus: now - ruleSelector: - matchLabels: - prometheus: now + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now + template: + metadata: + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now + spec: + serviceAccountName: prometheus + securityContext: + runAsUser: 65532 + runAsGroup: 65532 + runAsNonRoot: true + fsGroup: 65532 + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v3.10.0 + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/data + - --storage.tsdb.retention.time=2h + - --web.enable-lifecycle + ports: + - name: web + containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: web + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: web + initialDelaySeconds: 15 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi + volumeMounts: + - name: config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: rules + mountPath: /etc/prometheus/rules + - name: data + mountPath: /data + - name: configmap-reload + image: ghcr.io/jimmidyson/configmap-reload:v0.14.0 + args: + - --volume-dir=/etc/prometheus + - --volume-dir=/etc/prometheus/rules + - --webhook-url=http://127.0.0.1:9090/-/reload + volumeMounts: + - name: config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: rules + mountPath: /etc/prometheus/rules + resources: + requests: + cpu: 5m + memory: 16Mi + limits: + memory: 32Mi + volumes: + - name: config + configMap: + name: prometheus-now-config + - name: rules + configMap: + name: prometheus-now-rules + - name: data + emptyDir: {} diff --git 
a/monitoring/prometheus-now/prometheus.yml b/monitoring/prometheus-now/prometheus.yml new file mode 100644 index 00000000..d3bd89f9 --- /dev/null +++ b/monitoring/prometheus-now/prometheus.yml @@ -0,0 +1,57 @@ +global: + scrape_interval: 30s + evaluation_interval: 30s + scrape_protocols: + - OpenMetricsText1.0.0 + - OpenMetricsText0.0.1 + - PrometheusProto + - PrometheusText1.0.0 + - PrometheusText0.0.4 + +rule_files: + - /etc/prometheus/rules/*.yml + +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager-main.monitoring.svc.cluster.local:9093 + +scrape_configs: + + # Scrape Prometheus itself + - job_name: prometheus + static_configs: + - targets: ['localhost:9090'] + + # node-exporter: replaces PodMonitor/monitoring/node-exporter + - job_name: node-exporter + kubernetes_sd_configs: + - role: pod + namespaces: + names: [monitoring] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + action: keep + regex: node-exporter + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: instance + + # kube-state-metrics: replaces ServiceMonitor/monitoring/kube-state-metrics + - job_name: kube-state-metrics + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: [monitoring] + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + action: keep + regex: kube-state-metrics + - source_labels: [__meta_kubernetes_endpoint_port_name] + action: keep + regex: http-metrics + honor_labels: true + metric_relabel_configs: + - source_labels: [__name__] + regex: kube_replicaset_status_observed_generation + action: drop diff --git a/monitoring/prometheus-now/rules/node-exporter.yml b/monitoring/prometheus-now/rules/node-exporter.yml new file mode 100644 index 00000000..ff0e3c54 --- /dev/null +++ b/monitoring/prometheus-now/rules/node-exporter.yml @@ -0,0 +1,18 @@ +groups: + - name: node-exporter-recording-rules + rules: + - record: instance:node_cpus:count + expr: 
count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode) + - record: instance_cpu:node_cpu_seconds_not_idle:rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode) + - record: instance_mode:node_cpu_seconds:rate5m + expr: sum(rate(node_cpu_seconds_total[5m])) without (cpu) + - record: instance_cpu:node_cpu_top:rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode, cpu) + - record: instance:node_cpu_utilization:ratio + expr: sum(instance_mode:node_cpu_seconds:rate5m{mode!="idle"}) without (mode) / instance:node_cpus:count + - record: instance_cpu:node_cpu_top:ratio + expr: >- + sum(instance_cpu:node_cpu_top:rate5m) without (mode, cpu) + / + sum(rate(node_cpu_seconds_total[5m])) without (mode, cpu) From 9055afa5ade5d52ada0df7afcafdd7ad6bb81052 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:26 +0100 Subject: [PATCH 2/8] Downgrade alertmanager to v0.28.1 for experiment compatibility v0.31.0 was not available in the container registry at experiment time. Revert this commit to restore v0.31.0 once it is published. 
Co-Authored-By: Claude Opus 4.6 --- monitoring/alertmanager-main/main-alertmanager.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring/alertmanager-main/main-alertmanager.yaml b/monitoring/alertmanager-main/main-alertmanager.yaml index 301d1988..975cf928 100644 --- a/monitoring/alertmanager-main/main-alertmanager.yaml +++ b/monitoring/alertmanager-main/main-alertmanager.yaml @@ -24,7 +24,7 @@ spec: fsGroup: 65534 containers: - name: alertmanager - image: quay.io/prometheus/alertmanager:v0.31.0 + image: quay.io/prometheus/alertmanager:v0.28.1 args: - --config.file=/etc/alertmanager/alertmanager.yaml - --storage.path=/data From 5e9c90dfe65c47dd33d86fcf18dc6c86431d982c Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:40 +0100 Subject: [PATCH 3/8] Add Thanos Receive + GreptimeDB with dual remote_write Deploy Thanos Receive (StatefulSet) + Query (Deployment) and GreptimeDB standalone as competing remote_write backends for the metrics-v2 experiment. Prometheus sends scraped metrics to both via remote_write for side-by-side comparison. 
Co-Authored-By: Claude Opus 4.6 --- monitoring/greptimedb/greptimedb.yaml | 66 ++++++++++++++++++++ monitoring/greptimedb/kustomization.yaml | 7 +++ monitoring/prometheus-now/prometheus.yml | 4 ++ monitoring/thanos/kustomization.yaml | 8 +++ monitoring/thanos/thanos-query.yaml | 57 +++++++++++++++++ monitoring/thanos/thanos-receive.yaml | 79 ++++++++++++++++++++++++ 6 files changed, 221 insertions(+) create mode 100644 monitoring/greptimedb/greptimedb.yaml create mode 100644 monitoring/greptimedb/kustomization.yaml create mode 100644 monitoring/thanos/kustomization.yaml create mode 100644 monitoring/thanos/thanos-query.yaml create mode 100644 monitoring/thanos/thanos-receive.yaml diff --git a/monitoring/greptimedb/greptimedb.yaml b/monitoring/greptimedb/greptimedb.yaml new file mode 100644 index 00000000..2e63e7f9 --- /dev/null +++ b/monitoring/greptimedb/greptimedb.yaml @@ -0,0 +1,66 @@ +apiVersion: v1 +kind: Service +metadata: + name: greptimedb +spec: + ports: + - name: http + port: 4000 + targetPort: http + - name: grpc + port: 4001 + targetPort: grpc + selector: + app: greptimedb +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: greptimedb + labels: + app: greptimedb +spec: + replicas: 1 + selector: + matchLabels: + app: greptimedb + template: + metadata: + labels: + app: greptimedb + spec: + containers: + - name: greptimedb + image: greptime/greptimedb:v0.12.0 + args: + - standalone + - start + - --http-addr=0.0.0.0:4000 + - --rpc-addr=0.0.0.0:4001 + ports: + - name: http + containerPort: 4000 + - name: grpc + containerPort: 4001 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 50m + memory: 256Mi + limits: + memory: 768Mi + volumeMounts: + - name: data + mountPath: /tmp/greptimedb + volumes: + - name: data + emptyDir: {} diff --git a/monitoring/greptimedb/kustomization.yaml 
b/monitoring/greptimedb/kustomization.yaml new file mode 100644 index 00000000..81f873ff --- /dev/null +++ b/monitoring/greptimedb/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: +- greptimedb.yaml diff --git a/monitoring/prometheus-now/prometheus.yml b/monitoring/prometheus-now/prometheus.yml index d3bd89f9..d415557e 100644 --- a/monitoring/prometheus-now/prometheus.yml +++ b/monitoring/prometheus-now/prometheus.yml @@ -11,6 +11,10 @@ global: rule_files: - /etc/prometheus/rules/*.yml +remote_write: + - url: http://thanos-receive.monitoring.svc.cluster.local:19291/api/v1/receive + - url: http://greptimedb.monitoring.svc.cluster.local:4000/v1/prometheus/write + alerting: alertmanagers: - static_configs: diff --git a/monitoring/thanos/kustomization.yaml b/monitoring/thanos/kustomization.yaml new file mode 100644 index 00000000..59e649f7 --- /dev/null +++ b/monitoring/thanos/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: +- thanos-receive.yaml +- thanos-query.yaml diff --git a/monitoring/thanos/thanos-query.yaml b/monitoring/thanos/thanos-query.yaml new file mode 100644 index 00000000..bb7eaf7c --- /dev/null +++ b/monitoring/thanos/thanos-query.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: thanos-query +spec: + ports: + - name: http + port: 9090 + targetPort: http + selector: + app: thanos-query +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: thanos-query + labels: + app: thanos-query +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-query + template: + metadata: + labels: + app: thanos-query + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + containers: + - name: thanos-query + image: quay.io/thanos/thanos:v0.37.2 + args: + - query + - --http-address=0.0.0.0:9090 + - 
--endpoint=thanos-receive.monitoring.svc.cluster.local:10901 + ports: + - name: http + containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 20m + memory: 64Mi + limits: + memory: 256Mi diff --git a/monitoring/thanos/thanos-receive.yaml b/monitoring/thanos/thanos-receive.yaml new file mode 100644 index 00000000..2f9f2f2c --- /dev/null +++ b/monitoring/thanos/thanos-receive.yaml @@ -0,0 +1,79 @@ +apiVersion: v1 +kind: Service +metadata: + name: thanos-receive +spec: + ports: + - name: grpc + port: 10901 + targetPort: grpc + - name: http + port: 10902 + targetPort: http + - name: remote-write + port: 19291 + targetPort: remote-write + selector: + app: thanos-receive +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: thanos-receive + labels: + app: thanos-receive +spec: + replicas: 1 + serviceName: thanos-receive + selector: + matchLabels: + app: thanos-receive + template: + metadata: + labels: + app: thanos-receive + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + fsGroup: 65534 + containers: + - name: thanos-receive + image: quay.io/thanos/thanos:v0.37.2 + args: + - receive + - --tsdb.path=/data + - --tsdb.retention=2h + - --grpc-address=0.0.0.0:10901 + - --http-address=0.0.0.0:10902 + - --remote-write.address=0.0.0.0:19291 + - --label=receive_replica="0" + ports: + - name: grpc + containerPort: 10901 + - name: http + containerPort: 10902 + - name: remote-write + containerPort: 19291 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 512Mi + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + emptyDir: {} From 1663f287b33f43d68954debedabec90f199c0af0 Mon Sep 
17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 16:47:47 +0100 Subject: [PATCH 4/8] Add metrics-v2 experiment results report Thanos wins 8.35 vs 8.00 over GreptimeDB on weighted criteria: query correctness, operational complexity, resource usage, maturity, and storage cost projection. All PromQL queries returned consistent results across all three backends. Documents deviations from the original experiment plan. Co-Authored-By: Claude Opus 4.6 --- .../metrics-v2-experiment-results.md | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 tmp-migration-plans/metrics-v2-experiment-results.md diff --git a/tmp-migration-plans/metrics-v2-experiment-results.md b/tmp-migration-plans/metrics-v2-experiment-results.md new file mode 100644 index 00000000..4383ffc9 --- /dev/null +++ b/tmp-migration-plans/metrics-v2-experiment-results.md @@ -0,0 +1,243 @@ +# ystack metrics-v2 experiment — Results + +Date: 2026-03-03 +Branch: `metrics-v2-experiment` +Machine: macOS Darwin 23.6.0, x86_64, 16 GB RAM, 12 CPUs +Cluster: k3d ystack, k3s v1.35.1, `--memory=12G --docker-update="--cpus=8"` + +## Deviations from plan + +### 1. Prometheus config: `fallback_scrape_protocol` is not a global field + +The vanilla prometheus plan specified `fallback_scrape_protocol: PrometheusText0.0.4` +as a global config option. Prometheus v3.10.0 rejected this — it's not a valid global +field. Replaced with `global.scrape_protocols` list instead: + +```yaml +scrape_protocols: + - OpenMetricsText1.0.0 + - OpenMetricsText0.0.1 + - PrometheusProto + - PrometheusText1.0.0 + - PrometheusText0.0.4 +``` + +### 2. Alertmanager version: v0.28.1 instead of v0.31.0 + +The plan specified Alertmanager v0.31.0. Used v0.28.1 instead because it was the +latest stable version available via the standard container registry at experiment time. +No functional impact — both use v2 API. + +### 3. Monitoring directory consolidation + +The plan assumed a single `k3s/30-monitoring/` directory. 
The actual codebase had the +monitoring split across `k3s/30-monitoring-operator/` and `k3s/31-monitoring/`. Created +a new `k3s/30-monitoring/` that merges both (minus the operator), leaving the old +directories in place for now. + +### 4. Converge script: partial failure recovery + +The converge script timed out on the first provision because of deviation #1. The +remaining steps (HTTPRoute, prod-registry, buildkit) were applied manually. The +converge script was updated to reflect the new structure. + +### 5. Blob store: versitygw (not minio) + +The plan referenced minio in some contexts. The codebase has already migrated to +versitygw. No changes needed for the experiment itself — Thanos Receive and GreptimeDB +both use emptyDir, not S3 object storage. Any future production deployment that uses +object storage for long-term retention must target the versitygw S3 API, not minio. + +### 6. configmap-reload sidecar added + +The plan did not mention configmap-reload, but it was added to the Prometheus +Deployment to enable live config/rules reloading without pod restarts. This is +necessary for the `--web.enable-lifecycle` reload endpoint to be triggered on +ConfigMap changes. + +### 7. No `k3s/30-monitoring-operator` or `k3s/31-monitoring` removal + +The old directories were left in place to avoid breaking any other branch that +references them. They can be removed once the migration is merged to main. + +--- + +## Query comparison results + +All queries run against Prometheus (source of truth), Thanos Query, and GreptimeDB. + +### Test 1: Instant query `up` + +| Backend | Target count | All UP? 
| +|---------|-------------|---------| +| Prometheus | 3 | Yes (node-exporter, kube-state-metrics, prometheus) | +| Thanos Query | 3 | Yes | +| GreptimeDB | 3 | Yes | + +**Result: Identical** + +### Test 2: Range query `rate(node_cpu_seconds_total{mode="idle"}[5m])` + +| Backend | Series count | Values | +|---------|-------------|--------| +| Prometheus | 12 | cpu=0: 0.306213 ... cpu=11: 0.300287 | +| Thanos Query | 12 | cpu=0: 0.306671 ... cpu=11: 0.300737 | +| GreptimeDB | 12 | cpu=0: 0.306880 ... cpu=11: 0.300942 | + +**Result: Consistent** — minor value differences (<0.3%) due to timestamp alignment +and sample boundaries. Same series count, same label sets. + +### Test 3: Recording rule `instance:node_cpus:count` + +| Backend | Result | +|---------|--------| +| Prometheus | k3d-ystack-server-0: 12 | +| Thanos Query | k3d-ystack-server-0: 12 | +| GreptimeDB | k3d-ystack-server-0: 12 | + +**Result: Identical** — recording rules are evaluated by Prometheus and forwarded via +remote_write to both backends. Both return the correct value. + +### Test 4: Alert expression `kube_pod_status_phase{phase="Pending"} > 0` + +| Backend | Pending pods | +|---------|-------------| +| Prometheus | 0 | +| Thanos Query | 0 | +| GreptimeDB | 0 | + +**Result: Identical** — no pending pods at query time. + +### Test 5: Subquery `avg_over_time(instance:node_cpu_utilization:ratio[5m:])` + +| Backend | Result | +|---------|--------| +| Prometheus | 0.106564 | +| Thanos Query | 0.109263 | +| GreptimeDB | 0.110593 | + +**Result: Consistent** — all three support subquery syntax. Small value differences +from evaluation timing. + +### PromQL incompatibilities observed in GreptimeDB + +**None.** All tested queries returned correct results. 
GreptimeDB handled: +- Instant queries with label matchers +- Rate functions over counters +- Recording rule results (received via remote_write) +- Comparison operators (> 0) +- Subqueries (step-aligned range evaluation) + +--- + +## Resource usage + +Measured via `kubectl top pod` after ~5 minutes of dual remote_write operation. + +| Component | CPU | Memory | Pod count | +|-----------|-----|--------|-----------| +| Prometheus (source) | 12m | 55Mi | 1 (2 containers) | +| Alertmanager | 3m | 18Mi | 1 | +| node-exporter | 4m | 9Mi | 1 | +| kube-state-metrics | 1m | 23Mi | 1 | +| **Thanos Receive** | **2m** | **37Mi** | **1** | +| **Thanos Query** | **2m** | **19Mi** | **1** | +| **GreptimeDB** | **19m** | **261Mi** | **1** | + +### Summary + +| Backend | Total CPU | Total Memory | Pod count | +|---------|-----------|-------------|-----------| +| Thanos (Receive + Query) | 4m | 56Mi | 2 | +| GreptimeDB (standalone) | 19m | 261Mi | 1 | + +Thanos uses **4.75x less CPU** and **4.66x less memory** than GreptimeDB for the same +workload. GreptimeDB's standalone mode bundles storage engine + query engine + metadata +in a single process, which explains the higher baseline. + +--- + +## Evaluation scores + +Using the criteria from the Mimir replacement research. + +### Query correctness (20%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 10/10 | All queries identical to Prometheus | +| GreptimeDB | 10/10 | All queries returned correct results | + +Both received full marks. In a larger test matrix with more complex PromQL (regex, +histogram_quantile, label_replace, etc.), GreptimeDB might show more divergence. + +### Operational complexity (40%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 7/10 | 2 components (Receive + Query), well-documented, CNCF graduated project. Would need Store + Compactor for production long-term storage. | +| GreptimeDB | 9/10 | 1 component in standalone mode, simpler topology. 
Distributed mode adds complexity (metasrv, datanode, frontend). | + +GreptimeDB wins on simplicity for small deployments. Thanos has more operational +overhead but is battle-tested at scale. + +### Resource usage (15%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 9/10 | 4m CPU, 56Mi memory — extremely lean | +| GreptimeDB | 5/10 | 19m CPU, 261Mi — higher baseline footprint | + +Thanos is significantly lighter. For a local dev cluster this matters. + +### Maturity (10%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 10/10 | CNCF graduated, v0.37.2, used at massive scale by many organizations | +| GreptimeDB | 6/10 | v0.12.0, growing project, fewer production references. Active development. | + +### Storage cost projection (15%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 8/10 | Uses S3-compatible object storage (versitygw). Well-understood cost model. Compactor reduces storage. | +| GreptimeDB | 7/10 | Also supports S3-compatible storage. Uses columnar format which should compress well. Less proven at scale. | + +Both can target versitygw for object storage. Thanos has a more mature compaction +story. + +### Weighted total + +| Backend | Correctness (20%) | Complexity (40%) | Resources (15%) | Maturity (10%) | Storage (15%) | **Total** | +|---------|-------------------|-----------------|-----------------|---------------|--------------|-----------| +| Thanos | 2.0 | 2.8 | 1.35 | 1.0 | 1.2 | **8.35** | +| GreptimeDB | 2.0 | 3.6 | 0.75 | 0.6 | 1.05 | **8.00** | + +--- + +## Recommendation + +**Thanos wins narrowly (8.35 vs 8.00)**, primarily due to its lower resource footprint +and maturity. However, the scores are close enough that the decision should also +consider: + +1. **For ystack local dev clusters**: Thanos is preferred — lighter resource usage + matters in constrained k3d environments, and the 2-component topology (Receive + + Query) is manageable. + +2. 
**For production multi-cluster**: Thanos is preferred — the Receive component + already supports multi-tenancy via labels, and the Query component can federate + across multiple Receive instances. Zone-aware ingestion is well-documented. + +3. **GreptimeDB remains interesting** for use cases that need SQL access to metrics + data or where the standalone deployment model is valued. It could be revisited in + a future evaluation as the project matures. + +## Next steps + +1. Remove GreptimeDB from the cluster (losing candidate) +2. Remove dual remote_write — keep only Thanos Receive +3. Add `monitoring/thanos/` to `k3s/30-monitoring/kustomization.yaml` +4. Update validate script to check Thanos components +5. Run `y-cluster-validate-ystack --context=local` to confirm From ea29842f5d5ecd38f8e0f8ef6240ea6e5a801306 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 17:07:15 +0100 Subject: [PATCH 5/8] Configure Thanos Receive and GreptimeDB to use versitygw S3 storage Both backends now write to versitygw object storage for storage cost comparison. Adds bucket-create jobs and S3 configuration for each. 
Co-Authored-By: Claude Opus 4.6 --- monitoring/greptimedb/bucket-create.yaml | 35 ++++++++++++++++++++++++ monitoring/greptimedb/config.toml | 7 +++++ monitoring/greptimedb/greptimedb.yaml | 6 ++++ monitoring/greptimedb/kustomization.yaml | 15 ++++++++++ monitoring/thanos/bucket-create.yaml | 35 ++++++++++++++++++++++++ monitoring/thanos/kustomization.yaml | 15 ++++++++++ monitoring/thanos/objstore.yml | 7 +++++ monitoring/thanos/thanos-receive.yaml | 6 ++++ 8 files changed, 126 insertions(+) create mode 100644 monitoring/greptimedb/bucket-create.yaml create mode 100644 monitoring/greptimedb/config.toml create mode 100644 monitoring/thanos/bucket-create.yaml create mode 100644 monitoring/thanos/objstore.yml diff --git a/monitoring/greptimedb/bucket-create.yaml b/monitoring/greptimedb/bucket-create.yaml new file mode 100644 index 00000000..d0ddbc8a --- /dev/null +++ b/monitoring/greptimedb/bucket-create.yaml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: bucket-create-greptimedb +spec: + template: + spec: + containers: + - name: mc + image: minio/mc:RELEASE.2025-08-13T08-35-41Z + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: minio + key: accesskey + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio + key: secretkey + - name: BUCKET_NAME + value: greptimedb + - name: S3_ENDPOINT + value: http://blobs-versitygw.ystack.svc.cluster.local + command: + - sh + - -ce + - | + until mc alias set s3 $S3_ENDPOINT $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY 2>/dev/null; do + sleep 2 + done + mc mb --ignore-existing s3/$BUCKET_NAME + restartPolicy: Never + backoffLimit: 10 diff --git a/monitoring/greptimedb/config.toml b/monitoring/greptimedb/config.toml new file mode 100644 index 00000000..ecca2a84 --- /dev/null +++ b/monitoring/greptimedb/config.toml @@ -0,0 +1,7 @@ +[storage] +type = "S3" +bucket = "greptimedb" +endpoint = "http://blobs-versitygw.ystack.svc.cluster.local" +access_key_id = "YstackEXAMPLEKEY" 
+secret_access_key = "github.com/Yolean/ystack-EXAMPLE" +region = "us-east-1" diff --git a/monitoring/greptimedb/greptimedb.yaml b/monitoring/greptimedb/greptimedb.yaml index 2e63e7f9..8eb79ab6 100644 --- a/monitoring/greptimedb/greptimedb.yaml +++ b/monitoring/greptimedb/greptimedb.yaml @@ -35,6 +35,7 @@ spec: args: - standalone - start + - --config-file=/etc/greptimedb/config.toml - --http-addr=0.0.0.0:4000 - --rpc-addr=0.0.0.0:4001 ports: @@ -61,6 +62,11 @@ spec: volumeMounts: - name: data mountPath: /tmp/greptimedb + - name: config + mountPath: /etc/greptimedb volumes: - name: data emptyDir: {} + - name: config + configMap: + name: greptimedb-config diff --git a/monitoring/greptimedb/kustomization.yaml b/monitoring/greptimedb/kustomization.yaml index 81f873ff..3ad5ad7a 100644 --- a/monitoring/greptimedb/kustomization.yaml +++ b/monitoring/greptimedb/kustomization.yaml @@ -5,3 +5,18 @@ namespace: monitoring resources: - greptimedb.yaml +- bucket-create.yaml + +generatorOptions: + disableNameSuffixHash: true + +secretGenerator: +- name: minio + literals: + - accesskey=YstackEXAMPLEKEY + - secretkey=github.com/Yolean/ystack-EXAMPLE + +configMapGenerator: +- name: greptimedb-config + files: + - config.toml diff --git a/monitoring/thanos/bucket-create.yaml b/monitoring/thanos/bucket-create.yaml new file mode 100644 index 00000000..d37911dc --- /dev/null +++ b/monitoring/thanos/bucket-create.yaml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: bucket-create-thanos-receive +spec: + template: + spec: + containers: + - name: mc + image: minio/mc:RELEASE.2025-08-13T08-35-41Z + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: minio + key: accesskey + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio + key: secretkey + - name: BUCKET_NAME + value: thanos-receive + - name: S3_ENDPOINT + value: http://blobs-versitygw.ystack.svc.cluster.local + command: + - sh + - -ce + - | + until mc alias set s3 $S3_ENDPOINT 
$AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY 2>/dev/null; do + sleep 2 + done + mc mb --ignore-existing s3/$BUCKET_NAME + restartPolicy: Never + backoffLimit: 10 diff --git a/monitoring/thanos/kustomization.yaml b/monitoring/thanos/kustomization.yaml index 59e649f7..19d878fb 100644 --- a/monitoring/thanos/kustomization.yaml +++ b/monitoring/thanos/kustomization.yaml @@ -6,3 +6,18 @@ namespace: monitoring resources: - thanos-receive.yaml - thanos-query.yaml +- bucket-create.yaml + +generatorOptions: + disableNameSuffixHash: true + +secretGenerator: +- name: minio + literals: + - accesskey=YstackEXAMPLEKEY + - secretkey=github.com/Yolean/ystack-EXAMPLE + +configMapGenerator: +- name: thanos-objstore + files: + - objstore.yml diff --git a/monitoring/thanos/objstore.yml b/monitoring/thanos/objstore.yml new file mode 100644 index 00000000..8ceb653e --- /dev/null +++ b/monitoring/thanos/objstore.yml @@ -0,0 +1,7 @@ +type: S3 +config: + bucket: thanos-receive + endpoint: blobs-versitygw.ystack.svc.cluster.local + insecure: true + access_key: YstackEXAMPLEKEY + secret_key: github.com/Yolean/ystack-EXAMPLE diff --git a/monitoring/thanos/thanos-receive.yaml b/monitoring/thanos/thanos-receive.yaml index 2f9f2f2c..9ed2947e 100644 --- a/monitoring/thanos/thanos-receive.yaml +++ b/monitoring/thanos/thanos-receive.yaml @@ -48,6 +48,7 @@ spec: - --http-address=0.0.0.0:10902 - --remote-write.address=0.0.0.0:19291 - --label=receive_replica="0" + - --objstore.config-file=/etc/thanos/objstore.yml ports: - name: grpc containerPort: 10901 @@ -74,6 +75,11 @@ spec: volumeMounts: - name: data mountPath: /data + - name: objstore-config + mountPath: /etc/thanos volumes: - name: data emptyDir: {} + - name: objstore-config + configMap: + name: thanos-objstore From b1e137de0535d955d729f4b498190b9996da859a Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 17:14:07 +0100 Subject: [PATCH 6/8] Add 5m block duration override for experiment verification WARNING comment included: these 
overrides should not be used in production. Forces frequent block cuts so S3 uploads are visible quickly during the metrics-v2 experiment. Co-Authored-By: Claude Opus 4.6 --- monitoring/thanos/thanos-receive.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/monitoring/thanos/thanos-receive.yaml b/monitoring/thanos/thanos-receive.yaml index 9ed2947e..5946d6fc 100644 --- a/monitoring/thanos/thanos-receive.yaml +++ b/monitoring/thanos/thanos-receive.yaml @@ -49,6 +49,10 @@ spec: - --remote-write.address=0.0.0.0:19291 - --label=receive_replica="0" - --objstore.config-file=/etc/thanos/objstore.yml + # WARNING: Do not use these min/max-block-duration overrides in production. + # They force frequent block cuts for experiment verification only. + - --tsdb.min-block-duration=5m + - --tsdb.max-block-duration=5m ports: - name: grpc containerPort: 10901 From 7e3d067210e14e8556e5319e7fcbacc6114af683 Mon Sep 17 00:00:00 2001 From: Yolean macbot01 Date: Tue, 3 Mar 2026 17:24:40 +0100 Subject: [PATCH 7/8] Update experiment report with S3 storage cost comparison Both backends now write to versitygw. GreptimeDB's columnar format produces 5.6x less data (252 KB vs 1.4 MB) for the same metrics workload. This flips the storage cost score and brings the weighted totals to a near-tie (Thanos 8.05 vs GreptimeDB 8.30). Co-Authored-By: Claude Opus 4.6 --- .../metrics-v2-experiment-results.md | 79 ++++++++++++++----- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/tmp-migration-plans/metrics-v2-experiment-results.md b/tmp-migration-plans/metrics-v2-experiment-results.md index 4383ffc9..0fc9bbcf 100644 --- a/tmp-migration-plans/metrics-v2-experiment-results.md +++ b/tmp-migration-plans/metrics-v2-experiment-results.md @@ -44,9 +44,17 @@ converge script was updated to reflect the new structure. ### 5. Blob store: versitygw (not minio) The plan referenced minio in some contexts. The codebase has already migrated to -versitygw. 
No changes needed for the experiment itself — Thanos Receive and GreptimeDB
-both use emptyDir, not S3 object storage. Any future production deployment that uses
-object storage for long-term retention must target the versitygw S3 API, not minio.
+versitygw. Both backends were reconfigured to write to versitygw S3 storage
+(`blobs-versitygw.ystack.svc.cluster.local`) for storage cost comparison. Bucket-create
+jobs provision `thanos-receive` and `greptimedb` buckets using the same minio/mc
+pattern as the registry.
+
+### 5.1 Thanos 5m block duration override
+
+To make Thanos upload blocks to object storage quickly enough for experiment
+observation, `--tsdb.min-block-duration=5m` and `--tsdb.max-block-duration=5m` were
+added. The default 2h block duration would mean no S3 uploads during a short
+experiment window. This override must NOT be used in production.
 
 ### 6. configmap-reload sidecar added
 
@@ -157,6 +165,30 @@ in a single process, which explains the higher baseline.
 
 ---
 
+## Object storage comparison
+
+Both backends configured to write to versitygw S3 buckets. Measured after ~17 minutes
+of dual remote_write with Thanos block duration forced to 5m.
+
+| Backend | Bucket size | Object count | Write pattern |
+|---------|------------|-------------|---------------|
+| Thanos Receive | 1.4 MB | 9 files (3 blocks) | Block-based: uploads ~3 files per 5m block (meta.json, index, chunks) |
+| GreptimeDB | 252 KB | 11 files | Columnar: writes smaller objects more frequently |
+
+GreptimeDB stores **5.6x less data** on object storage for the same metrics workload.
+Its columnar format compresses significantly better than Thanos's TSDB block format.
+
+**Caveats:**
+- Thanos block duration was artificially reduced from 2h to 5m. With default settings,
+  Thanos would batch more data per block, potentially improving compression ratio.
+- Thanos Compactor (not deployed in this experiment) further reduces long-term storage
+  by merging and downsampling blocks.
+- GreptimeDB's compaction behavior over longer time windows was not tested. +- 17 minutes of data is too short for definitive storage cost projections — a multi-day + test would be more representative. + +--- + ## Evaluation scores Using the criteria from the Mimir replacement research. @@ -201,38 +233,43 @@ Thanos is significantly lighter. For a local dev cluster this matters. | Backend | Score | Notes | |---------|-------|-------| -| Thanos | 8/10 | Uses S3-compatible object storage (versitygw). Well-understood cost model. Compactor reduces storage. | -| GreptimeDB | 7/10 | Also supports S3-compatible storage. Uses columnar format which should compress well. Less proven at scale. | +| Thanos | 6/10 | 1.4 MB for ~17 min of data. Block-based format is less space-efficient. Compactor helps long-term but adds operational complexity. | +| GreptimeDB | 9/10 | 252 KB for same data — 5.6x smaller. Columnar format compresses metrics data very well. Fewer bytes = lower S3 storage and egress cost. | -Both can target versitygw for object storage. Thanos has a more mature compaction -story. +GreptimeDB's columnar storage format produces significantly smaller objects. Both +backends target versitygw S3. While Thanos Compactor can reduce long-term storage, +GreptimeDB's baseline efficiency is notably better. ### Weighted total | Backend | Correctness (20%) | Complexity (40%) | Resources (15%) | Maturity (10%) | Storage (15%) | **Total** | |---------|-------------------|-----------------|-----------------|---------------|--------------|-----------| -| Thanos | 2.0 | 2.8 | 1.35 | 1.0 | 1.2 | **8.35** | -| GreptimeDB | 2.0 | 3.6 | 0.75 | 0.6 | 1.05 | **8.00** | +| Thanos | 2.0 | 2.8 | 1.35 | 1.0 | 0.9 | **8.05** | +| GreptimeDB | 2.0 | 3.6 | 0.75 | 0.6 | 1.35 | **8.30** | --- ## Recommendation -**Thanos wins narrowly (8.35 vs 8.00)**, primarily due to its lower resource footprint -and maturity. 
However, the scores are close enough that the decision should also -consider: +**The two backends are essentially tied (Thanos 8.05 vs GreptimeDB 8.30)** after +accounting for measured object storage efficiency. GreptimeDB's columnar format +produces 5.6x less data on S3, which flips the storage cost score and narrows +Thanos's advantage on maturity and resource usage. + +1. **For ystack local dev clusters**: Thanos is still preferred — lighter CPU/memory + footprint matters in constrained k3d environments, and storage cost is less + relevant with emptyDir/local volumes. -1. **For ystack local dev clusters**: Thanos is preferred — lighter resource usage - matters in constrained k3d environments, and the 2-component topology (Receive + - Query) is manageable. +2. **For production multi-cluster with S3 storage costs**: GreptimeDB deserves + serious consideration — its storage efficiency advantage compounds at scale, + and lower object counts mean fewer S3 API calls (PUT/GET costs). -2. **For production multi-cluster**: Thanos is preferred — the Receive component - already supports multi-tenancy via labels, and the Query component can federate - across multiple Receive instances. Zone-aware ingestion is well-documented. +3. **Thanos advantages**: CNCF graduated maturity, battle-tested at massive scale, + well-documented multi-tenancy and zone-aware ingestion, lower runtime resource + footprint. -3. **GreptimeDB remains interesting** for use cases that need SQL access to metrics - data or where the standalone deployment model is valued. It could be revisited in - a future evaluation as the project matures. +4. **GreptimeDB advantages**: Simpler single-component topology, dramatically better + storage efficiency, SQL access to metrics data, active development pace. 
## Next steps From 77af59403c572aa99405291b2ebe9b70aa4a23f3 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Wed, 11 Mar 2026 16:54:43 +0100 Subject: [PATCH 8/8] y-cluster-provision-k3d: skip sudo when /etc/hosts is already up to date for provisioners that use a fixed IP. Use y-k8s-ingress-hosts -check before attempting -write, so provision can complete without a TTY or sudo when entries already exist. Co-Authored-By: Claude Opus 4.6 --- bin/y-cluster-provision-k3d | 7 +++++-- bin/y-cluster-provision-lima | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bin/y-cluster-provision-k3d b/bin/y-cluster-provision-k3d index 0a388a97..34292107 100755 --- a/bin/y-cluster-provision-k3d +++ b/bin/y-cluster-provision-k3d @@ -129,5 +129,8 @@ PROD_REGISTRY_IP=$(kubectl --context=$CTX -n ystack get service prod-registry -o docker exec k3d-ystack-server-0 sh -cex "echo '$BUILDS_REGISTRY_IP builds-registry.ystack.svc.cluster.local' >> /etc/hosts" docker exec k3d-ystack-server-0 sh -cex "echo '$PROD_REGISTRY_IP prod-registry.ystack.svc.cluster.local' >> /etc/hosts" -echo "# Updating /etc/hosts (requires sudo) ..." -y-k8s-ingress-hosts --context=$CTX -write -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}" +echo "# Checking /etc/hosts ..." +if ! y-k8s-ingress-hosts --context=$CTX -check -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}"; then + echo "# Updating /etc/hosts (requires sudo) ..." 
+ y-k8s-ingress-hosts --context=$CTX -write -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}" +fi diff --git a/bin/y-cluster-provision-lima b/bin/y-cluster-provision-lima index 389360fd..38f45662 100755 --- a/bin/y-cluster-provision-lima +++ b/bin/y-cluster-provision-lima @@ -122,5 +122,8 @@ PROD_REGISTRY_IP=$(kubectl --context=$CTX -n ystack get service prod-registry -o limactl shell ystack sudo sh -c "echo '$BUILDS_REGISTRY_IP builds-registry.ystack.svc.cluster.local' >> /etc/hosts" limactl shell ystack sudo sh -c "echo '$PROD_REGISTRY_IP prod-registry.ystack.svc.cluster.local' >> /etc/hosts" -echo "# Updating /etc/hosts (requires sudo) ..." -y-k8s-ingress-hosts --context=$CTX -write -override-ip 127.0.0.1 +echo "# Checking /etc/hosts ..." +if ! y-k8s-ingress-hosts --context=$CTX -check -override-ip 127.0.0.1; then + echo "# Updating /etc/hosts (requires sudo) ..." + y-k8s-ingress-hosts --context=$CTX -write -override-ip 127.0.0.1 +fi