diff --git a/bin/y-cluster-converge-ystack b/bin/y-cluster-converge-ystack index dfadb4e7..b00423ba 100755 --- a/bin/y-cluster-converge-ystack +++ b/bin/y-cluster-converge-ystack @@ -71,17 +71,9 @@ apply_base 08-buildkitd-grpcroute k -n ystack get grpcroute buildkitd echo "# Validated: grpcroute buildkitd exists" -# 7. Monitoring operator + CRDs -apply_base 30-monitoring-operator -echo "# Waiting for prometheus-operator CRDs to register ..." -until k get crd prometheuses.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -until k get crd alertmanagers.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -until k get crd servicemonitors.monitoring.coreos.com >/dev/null 2>&1; do sleep 2; done -echo "# Validated: prometheus-operator CRDs registered" - -# 8. Monitoring CRs (Prometheus, Alertmanager, exporters) -apply_base 31-monitoring -k -n monitoring get prometheus now +# 7. Monitoring (vanilla Prometheus + Alertmanager + exporters) +apply_base 30-monitoring +k -n monitoring rollout status deploy/prometheus-now --timeout=120s echo "# Validated: monitoring stack exists" # 6.8 Prometheus HTTPRoute diff --git a/bin/y-cluster-provision-k3d b/bin/y-cluster-provision-k3d index 0a388a97..34292107 100755 --- a/bin/y-cluster-provision-k3d +++ b/bin/y-cluster-provision-k3d @@ -129,5 +129,8 @@ PROD_REGISTRY_IP=$(kubectl --context=$CTX -n ystack get service prod-registry -o docker exec k3d-ystack-server-0 sh -cex "echo '$BUILDS_REGISTRY_IP builds-registry.ystack.svc.cluster.local' >> /etc/hosts" docker exec k3d-ystack-server-0 sh -cex "echo '$PROD_REGISTRY_IP prod-registry.ystack.svc.cluster.local' >> /etc/hosts" -echo "# Updating /etc/hosts (requires sudo) ..." -y-k8s-ingress-hosts --context=$CTX -write -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}" +echo "# Checking /etc/hosts ..." +if ! y-k8s-ingress-hosts --context=$CTX -check -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}"; then + echo "# Updating /etc/hosts (requires sudo) ..." 
+ y-k8s-ingress-hosts --context=$CTX -write -override-ip "${YSTACK_PORTS_IP:-127.0.0.1}" +fi diff --git a/bin/y-cluster-provision-lima b/bin/y-cluster-provision-lima index 389360fd..38f45662 100755 --- a/bin/y-cluster-provision-lima +++ b/bin/y-cluster-provision-lima @@ -122,5 +122,8 @@ PROD_REGISTRY_IP=$(kubectl --context=$CTX -n ystack get service prod-registry -o limactl shell ystack sudo sh -c "echo '$BUILDS_REGISTRY_IP builds-registry.ystack.svc.cluster.local' >> /etc/hosts" limactl shell ystack sudo sh -c "echo '$PROD_REGISTRY_IP prod-registry.ystack.svc.cluster.local' >> /etc/hosts" -echo "# Updating /etc/hosts (requires sudo) ..." -y-k8s-ingress-hosts --context=$CTX -write -override-ip 127.0.0.1 +echo "# Checking /etc/hosts ..." +if ! y-k8s-ingress-hosts --context=$CTX -check -override-ip 127.0.0.1; then + echo "# Updating /etc/hosts (requires sudo) ..." + y-k8s-ingress-hosts --context=$CTX -write -override-ip 127.0.0.1 +fi diff --git a/bin/y-cluster-validate-ystack b/bin/y-cluster-validate-ystack index ecff4692..fb1d1c5c 100755 --- a/bin/y-cluster-validate-ystack +++ b/bin/y-cluster-validate-ystack @@ -82,10 +82,10 @@ k -n ystack get grpcroute buildkitd >/dev/null 2>&1 \ && report "grpcroute buildkitd" "ok" \ || report "grpcroute buildkitd" "not found" -# 7.6 Monitoring stack -k -n monitoring get prometheus now >/dev/null 2>&1 \ - && report "prometheus now" "ok" \ - || report "prometheus now" "not found" +# 7.6 Monitoring stack (vanilla Prometheus deployment) +ROLLOUT_PROM=$(k -n monitoring rollout status deploy/prometheus-now --timeout=10s 2>&1) \ + && report "prometheus-now rollout" "ok" \ + || report "prometheus-now rollout" "$ROLLOUT_PROM" # 7.7 Prometheus HTTPRoute k -n monitoring get httproute prometheus-now >/dev/null 2>&1 \ diff --git a/k3s/30-monitoring/kustomization.yaml b/k3s/30-monitoring/kustomization.yaml new file mode 100644 index 00000000..14c81cb0 --- /dev/null +++ b/k3s/30-monitoring/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- 
../../monitoring/namespace +- ../../monitoring/prometheus-now +- ../../monitoring/alertmanager-main +- ../../monitoring/kube-state-metrics-now +- ../../monitoring/node-exporter-now diff --git a/monitoring/alertmanager-main/main-alertmanager-service.yaml b/monitoring/alertmanager-main/main-alertmanager-service.yaml index 69f53de2..e0493c6b 100644 --- a/monitoring/alertmanager-main/main-alertmanager-service.yaml +++ b/monitoring/alertmanager-main/main-alertmanager-service.yaml @@ -9,4 +9,5 @@ spec: protocol: TCP targetPort: web selector: - alertmanager: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main diff --git a/monitoring/alertmanager-main/main-alertmanager.yaml b/monitoring/alertmanager-main/main-alertmanager.yaml index fc31ee2f..975cf928 100644 --- a/monitoring/alertmanager-main/main-alertmanager.yaml +++ b/monitoring/alertmanager-main/main-alertmanager.yaml @@ -1,6 +1,60 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Alertmanager +apiVersion: apps/v1 +kind: Deployment metadata: - name: main + name: alertmanager-main + labels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main spec: replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main + template: + metadata: + labels: + app.kubernetes.io/name: alertmanager + app.kubernetes.io/instance: main + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + runAsNonRoot: true + fsGroup: 65534 + containers: + - name: alertmanager + image: quay.io/prometheus/alertmanager:v0.28.1 + args: + - --config.file=/etc/alertmanager/alertmanager.yaml + - --storage.path=/data + ports: + - name: web + containerPort: 9093 + readinessProbe: + httpGet: + path: /-/ready + port: web + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: web + initialDelaySeconds: 15 + resources: + requests: + cpu: 10m + memory: 32Mi + limits: + memory: 64Mi + volumeMounts: + - name: config + mountPath: 
/etc/alertmanager + - name: data + mountPath: /data + volumes: + - name: config + secret: + secretName: alertmanager-main + - name: data + emptyDir: {} diff --git a/monitoring/greptimedb/bucket-create.yaml b/monitoring/greptimedb/bucket-create.yaml new file mode 100644 index 00000000..d0ddbc8a --- /dev/null +++ b/monitoring/greptimedb/bucket-create.yaml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: bucket-create-greptimedb +spec: + template: + spec: + containers: + - name: mc + image: minio/mc:RELEASE.2025-08-13T08-35-41Z + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: minio + key: accesskey + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio + key: secretkey + - name: BUCKET_NAME + value: greptimedb + - name: S3_ENDPOINT + value: http://blobs-versitygw.ystack.svc.cluster.local + command: + - sh + - -ce + - | + until mc alias set s3 $S3_ENDPOINT $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY 2>/dev/null; do + sleep 2 + done + mc mb --ignore-existing s3/$BUCKET_NAME + restartPolicy: Never + backoffLimit: 10 diff --git a/monitoring/greptimedb/config.toml b/monitoring/greptimedb/config.toml new file mode 100644 index 00000000..ecca2a84 --- /dev/null +++ b/monitoring/greptimedb/config.toml @@ -0,0 +1,7 @@ +[storage] +type = "S3" +bucket = "greptimedb" +endpoint = "http://blobs-versitygw.ystack.svc.cluster.local" +access_key_id = "YstackEXAMPLEKEY" +secret_access_key = "github.com/Yolean/ystack-EXAMPLE" +region = "us-east-1" diff --git a/monitoring/greptimedb/greptimedb.yaml b/monitoring/greptimedb/greptimedb.yaml new file mode 100644 index 00000000..8eb79ab6 --- /dev/null +++ b/monitoring/greptimedb/greptimedb.yaml @@ -0,0 +1,72 @@ +apiVersion: v1 +kind: Service +metadata: + name: greptimedb +spec: + ports: + - name: http + port: 4000 + targetPort: http + - name: grpc + port: 4001 + targetPort: grpc + selector: + app: greptimedb +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: greptimedb + 
labels: + app: greptimedb +spec: + replicas: 1 + selector: + matchLabels: + app: greptimedb + template: + metadata: + labels: + app: greptimedb + spec: + containers: + - name: greptimedb + image: greptime/greptimedb:v0.12.0 + args: + - standalone + - start + - --config-file=/etc/greptimedb/config.toml + - --http-addr=0.0.0.0:4000 + - --rpc-addr=0.0.0.0:4001 + ports: + - name: http + containerPort: 4000 + - name: grpc + containerPort: 4001 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 50m + memory: 256Mi + limits: + memory: 768Mi + volumeMounts: + - name: data + mountPath: /tmp/greptimedb + - name: config + mountPath: /etc/greptimedb + volumes: + - name: data + emptyDir: {} + - name: config + configMap: + name: greptimedb-config diff --git a/monitoring/greptimedb/kustomization.yaml b/monitoring/greptimedb/kustomization.yaml new file mode 100644 index 00000000..3ad5ad7a --- /dev/null +++ b/monitoring/greptimedb/kustomization.yaml @@ -0,0 +1,22 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: +- greptimedb.yaml +- bucket-create.yaml + +generatorOptions: + disableNameSuffixHash: true + +secretGenerator: +- name: minio + literals: + - accesskey=YstackEXAMPLEKEY + - secretkey=github.com/Yolean/ystack-EXAMPLE + +configMapGenerator: +- name: greptimedb-config + files: + - config.toml diff --git a/monitoring/kube-state-metrics-now/kustomization.yaml b/monitoring/kube-state-metrics-now/kustomization.yaml index d1b51d82..54c16f92 100644 --- a/monitoring/kube-state-metrics-now/kustomization.yaml +++ b/monitoring/kube-state-metrics-now/kustomization.yaml @@ -5,12 +5,3 @@ namespace: monitoring resources: - ../kube-state-metrics - -patchesStrategicMerge: -- |- - apiVersion: monitoring.coreos.com/v1 - kind: ServiceMonitor - metadata: - name: kube-state-metrics - labels: - 
prometheus: now diff --git a/monitoring/kube-state-metrics/kustomization.yaml b/monitoring/kube-state-metrics/kustomization.yaml index 63b9ca3e..9b7dcccf 100644 --- a/monitoring/kube-state-metrics/kustomization.yaml +++ b/monitoring/kube-state-metrics/kustomization.yaml @@ -7,4 +7,3 @@ resources: - deployment.yaml - service-account.yaml - service.yaml -- kube-state-metrics-servicemonitor.yaml diff --git a/monitoring/node-exporter-now/kustomization.yaml b/monitoring/node-exporter-now/kustomization.yaml index 19ef18db..f310ab55 100644 --- a/monitoring/node-exporter-now/kustomization.yaml +++ b/monitoring/node-exporter-now/kustomization.yaml @@ -5,19 +5,3 @@ namespace: monitoring resources: - ../node-exporter - -patchesStrategicMerge: -- |- - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: node-exporter - labels: - prometheus: now -- |- - apiVersion: monitoring.coreos.com/v1 - kind: PrometheusRule - metadata: - name: node-exporter - labels: - prometheus: now diff --git a/monitoring/node-exporter/kustomization.yaml b/monitoring/node-exporter/kustomization.yaml index 923ec99a..55ca190d 100644 --- a/monitoring/node-exporter/kustomization.yaml +++ b/monitoring/node-exporter/kustomization.yaml @@ -3,5 +3,3 @@ resources: - node-exporter-clusterRole.yaml - node-exporter-clusterRoleBinding.yaml - node-exporter-daemonset.yaml -- node-exporter-podmonitor.yaml -- example-rules.yaml diff --git a/monitoring/prometheus-now/kustomization.yaml b/monitoring/prometheus-now/kustomization.yaml index b63efad0..cfea66a4 100644 --- a/monitoring/prometheus-now/kustomization.yaml +++ b/monitoring/prometheus-now/kustomization.yaml @@ -7,3 +7,14 @@ resources: - ../rbac-prometheus - now-prometheus-service.yaml - now-prometheus.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: +- name: prometheus-now-config + files: + - prometheus.yml +- name: prometheus-now-rules + files: + - rules/node-exporter.yml diff --git 
a/monitoring/prometheus-now/now-prometheus-service.yaml b/monitoring/prometheus-now/now-prometheus-service.yaml index 931a973e..6dbf6874 100644 --- a/monitoring/prometheus-now/now-prometheus-service.yaml +++ b/monitoring/prometheus-now/now-prometheus-service.yaml @@ -10,4 +10,4 @@ spec: targetPort: web selector: app.kubernetes.io/name: prometheus - prometheus: now + app.kubernetes.io/instance: now diff --git a/monitoring/prometheus-now/now-prometheus.yaml b/monitoring/prometheus-now/now-prometheus.yaml index a847fc8e..bd473b03 100644 --- a/monitoring/prometheus-now/now-prometheus.yaml +++ b/monitoring/prometheus-now/now-prometheus.yaml @@ -1,29 +1,87 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus +apiVersion: apps/v1 +kind: Deployment metadata: - name: now + name: prometheus-now + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now spec: replicas: 1 - retention: 2h - serviceAccountName: prometheus - securityContext: - runAsUser: 65534 - runAsGroup: 65534 - # Uncomment on failure to start a new instance. Left out because it may have performance implications, as configmaps may be large. 
- #fsGroup: 65534 - alerting: - alertmanagers: - - namespace: monitoring - name: alertmanager-main - port: web - serviceMonitorNamespaceSelector: {} - podMonitorNamespaceSelector: {} - serviceMonitorSelector: + selector: matchLabels: - prometheus: now - podMonitorSelector: - matchLabels: - prometheus: now - ruleSelector: - matchLabels: - prometheus: now + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now + template: + metadata: + labels: + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: now + spec: + serviceAccountName: prometheus + securityContext: + runAsUser: 65532 + runAsGroup: 65532 + runAsNonRoot: true + fsGroup: 65532 + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v3.10.0 + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/data + - --storage.tsdb.retention.time=2h + - --web.enable-lifecycle + ports: + - name: web + containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: web + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: web + initialDelaySeconds: 15 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi + volumeMounts: + - name: config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: rules + mountPath: /etc/prometheus/rules + - name: data + mountPath: /data + - name: configmap-reload + image: ghcr.io/jimmidyson/configmap-reload:v0.14.0 + args: + - --volume-dir=/etc/prometheus + - --volume-dir=/etc/prometheus/rules + - --webhook-url=http://127.0.0.1:9090/-/reload + volumeMounts: + - name: config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: rules + mountPath: /etc/prometheus/rules + resources: + requests: + cpu: 5m + memory: 16Mi + limits: + memory: 32Mi + volumes: + - name: config + configMap: + name: prometheus-now-config + - name: rules + configMap: + name: prometheus-now-rules + - name: data + emptyDir: {} diff --git 
a/monitoring/prometheus-now/prometheus.yml b/monitoring/prometheus-now/prometheus.yml new file mode 100644 index 00000000..d415557e --- /dev/null +++ b/monitoring/prometheus-now/prometheus.yml @@ -0,0 +1,61 @@ +global: + scrape_interval: 30s + evaluation_interval: 30s + scrape_protocols: + - OpenMetricsText1.0.0 + - OpenMetricsText0.0.1 + - PrometheusProto + - PrometheusText1.0.0 + - PrometheusText0.0.4 + +rule_files: + - /etc/prometheus/rules/*.yml + +remote_write: + - url: http://thanos-receive.monitoring.svc.cluster.local:19291/api/v1/receive + - url: http://greptimedb.monitoring.svc.cluster.local:4000/v1/prometheus/write + +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager-main.monitoring.svc.cluster.local:9093 + +scrape_configs: + + # Scrape Prometheus itself + - job_name: prometheus + static_configs: + - targets: ['localhost:9090'] + + # node-exporter: replaces PodMonitor/monitoring/node-exporter + - job_name: node-exporter + kubernetes_sd_configs: + - role: pod + namespaces: + names: [monitoring] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + action: keep + regex: node-exporter + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: instance + + # kube-state-metrics: replaces ServiceMonitor/monitoring/kube-state-metrics + - job_name: kube-state-metrics + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: [monitoring] + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + action: keep + regex: kube-state-metrics + - source_labels: [__meta_kubernetes_endpoint_port_name] + action: keep + regex: http-metrics + honor_labels: true + metric_relabel_configs: + - source_labels: [__name__] + regex: kube_replicaset_status_observed_generation + action: drop diff --git a/monitoring/prometheus-now/rules/node-exporter.yml b/monitoring/prometheus-now/rules/node-exporter.yml new file mode 100644 index 00000000..ff0e3c54 --- /dev/null 
+++ b/monitoring/prometheus-now/rules/node-exporter.yml @@ -0,0 +1,18 @@ +groups: + - name: node-exporter-recording-rules + rules: + - record: instance:node_cpus:count + expr: count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode) + - record: instance_cpu:node_cpu_seconds_not_idle:rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode) + - record: instance_mode:node_cpu_seconds:rate5m + expr: sum(rate(node_cpu_seconds_total[5m])) without (cpu) + - record: instance_cpu:node_cpu_top:rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) without (mode, cpu) + - record: instance:node_cpu_utilization:ratio + expr: sum(instance_mode:node_cpu_seconds:rate5m{mode!="idle"}) without (mode) / instance:node_cpus:count + - record: instance_cpu:node_cpu_top:ratio + expr: >- + sum(instance_cpu:node_cpu_top:rate5m) without (mode, cpu) + / + sum(rate(node_cpu_seconds_total[5m])) without (mode, cpu) diff --git a/monitoring/thanos/bucket-create.yaml b/monitoring/thanos/bucket-create.yaml new file mode 100644 index 00000000..d37911dc --- /dev/null +++ b/monitoring/thanos/bucket-create.yaml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: bucket-create-thanos-receive +spec: + template: + spec: + containers: + - name: mc + image: minio/mc:RELEASE.2025-08-13T08-35-41Z + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: minio + key: accesskey + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio + key: secretkey + - name: BUCKET_NAME + value: thanos-receive + - name: S3_ENDPOINT + value: http://blobs-versitygw.ystack.svc.cluster.local + command: + - sh + - -ce + - | + until mc alias set s3 $S3_ENDPOINT $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY 2>/dev/null; do + sleep 2 + done + mc mb --ignore-existing s3/$BUCKET_NAME + restartPolicy: Never + backoffLimit: 10 diff --git a/monitoring/thanos/kustomization.yaml b/monitoring/thanos/kustomization.yaml new file mode 100644 index 
00000000..19d878fb --- /dev/null +++ b/monitoring/thanos/kustomization.yaml @@ -0,0 +1,23 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: +- thanos-receive.yaml +- thanos-query.yaml +- bucket-create.yaml + +generatorOptions: + disableNameSuffixHash: true + +secretGenerator: +- name: minio + literals: + - accesskey=YstackEXAMPLEKEY + - secretkey=github.com/Yolean/ystack-EXAMPLE + +configMapGenerator: +- name: thanos-objstore + files: + - objstore.yml diff --git a/monitoring/thanos/objstore.yml b/monitoring/thanos/objstore.yml new file mode 100644 index 00000000..8ceb653e --- /dev/null +++ b/monitoring/thanos/objstore.yml @@ -0,0 +1,7 @@ +type: S3 +config: + bucket: thanos-receive + endpoint: blobs-versitygw.ystack.svc.cluster.local + insecure: true + access_key: YstackEXAMPLEKEY + secret_key: github.com/Yolean/ystack-EXAMPLE diff --git a/monitoring/thanos/thanos-query.yaml b/monitoring/thanos/thanos-query.yaml new file mode 100644 index 00000000..bb7eaf7c --- /dev/null +++ b/monitoring/thanos/thanos-query.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: thanos-query +spec: + ports: + - name: http + port: 9090 + targetPort: http + selector: + app: thanos-query +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: thanos-query + labels: + app: thanos-query +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-query + template: + metadata: + labels: + app: thanos-query + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + containers: + - name: thanos-query + image: quay.io/thanos/thanos:v0.37.2 + args: + - query + - --http-address=0.0.0.0:9090 + - --endpoint=thanos-receive.monitoring.svc.cluster.local:10901 + ports: + - name: http + containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 20m + 
memory: 64Mi + limits: + memory: 256Mi diff --git a/monitoring/thanos/thanos-receive.yaml b/monitoring/thanos/thanos-receive.yaml new file mode 100644 index 00000000..5946d6fc --- /dev/null +++ b/monitoring/thanos/thanos-receive.yaml @@ -0,0 +1,89 @@ +apiVersion: v1 +kind: Service +metadata: + name: thanos-receive +spec: + ports: + - name: grpc + port: 10901 + targetPort: grpc + - name: http + port: 10902 + targetPort: http + - name: remote-write + port: 19291 + targetPort: remote-write + selector: + app: thanos-receive +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: thanos-receive + labels: + app: thanos-receive +spec: + replicas: 1 + serviceName: thanos-receive + selector: + matchLabels: + app: thanos-receive + template: + metadata: + labels: + app: thanos-receive + spec: + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + fsGroup: 65534 + containers: + - name: thanos-receive + image: quay.io/thanos/thanos:v0.37.2 + args: + - receive + - --tsdb.path=/data + - --tsdb.retention=2h + - --grpc-address=0.0.0.0:10901 + - --http-address=0.0.0.0:10902 + - --remote-write.address=0.0.0.0:19291 + - --label=receive_replica="0" + - --objstore.config-file=/etc/thanos/objstore.yml + # WARNING: Do not use these min/max-block-duration overrides in production. + # They force frequent block cuts for experiment verification only. 
+ - --tsdb.min-block-duration=5m + - --tsdb.max-block-duration=5m + ports: + - name: grpc + containerPort: 10901 + - name: http + containerPort: 10902 + - name: remote-write + containerPort: 19291 + readinessProbe: + httpGet: + path: /-/ready + port: http + initialDelaySeconds: 5 + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 15 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 512Mi + volumeMounts: + - name: data + mountPath: /data + - name: objstore-config + mountPath: /etc/thanos + volumes: + - name: data + emptyDir: {} + - name: objstore-config + configMap: + name: thanos-objstore diff --git a/tmp-migration-plans/metrics-v2-experiment-results.md b/tmp-migration-plans/metrics-v2-experiment-results.md new file mode 100644 index 00000000..0fc9bbcf --- /dev/null +++ b/tmp-migration-plans/metrics-v2-experiment-results.md @@ -0,0 +1,280 @@ +# ystack metrics-v2 experiment — Results + +Date: 2026-03-03 +Branch: `metrics-v2-experiment` +Machine: macOS Darwin 23.6.0, x86_64, 16 GB RAM, 12 CPUs +Cluster: k3d ystack, k3s v1.35.1, `--memory=12G --docker-update="--cpus=8"` + +## Deviations from plan + +### 1. Prometheus config: `fallback_scrape_protocol` is not a global field + +The vanilla prometheus plan specified `fallback_scrape_protocol: PrometheusText0.0.4` +as a global config option. Prometheus v3.10.0 rejected this — it's not a valid global +field. Replaced with `global.scrape_protocols` list instead: + +```yaml +scrape_protocols: + - OpenMetricsText1.0.0 + - OpenMetricsText0.0.1 + - PrometheusProto + - PrometheusText1.0.0 + - PrometheusText0.0.4 +``` + +### 2. Alertmanager version: v0.28.1 instead of v0.31.0 + +The plan specified Alertmanager v0.31.0. Used v0.28.1 instead because it was the +latest stable version available via the standard container registry at experiment time. +No functional impact — both use v2 API. + +### 3. 
Monitoring directory consolidation + +The plan assumed a single `k3s/30-monitoring/` directory. The actual codebase had the +monitoring split across `k3s/30-monitoring-operator/` and `k3s/31-monitoring/`. Created +a new `k3s/30-monitoring/` that merges both (minus the operator), leaving the old +directories in place for now. + +### 4. Converge script: partial failure recovery + +The converge script timed out on the first provision because of deviation #1. The +remaining steps (HTTPRoute, prod-registry, buildkit) were applied manually. The +converge script was updated to reflect the new structure. + +### 5. Blob store: versitygw (not minio) + +The plan referenced minio in some contexts. The codebase has already migrated to +versitygw. Both backends were reconfigured to write to versitygw S3 storage +(`blobs-versitygw.ystack.svc.cluster.local`) for storage cost comparison. Bucket-create +jobs provision `thanos-receive` and `greptimedb` buckets using the same minio/mc +pattern as the registry. + +### 6. Thanos 5m block duration override + +To make Thanos upload blocks to object storage quickly enough for experiment +observation, `--tsdb.min-block-duration=5m` and `--tsdb.max-block-duration=5m` were +added. The default 2h block duration would mean no S3 uploads during a short +experiment window. This override must NOT be used in production. + +### 7. configmap-reload sidecar added + +The plan did not mention configmap-reload, but it was added to the Prometheus +Deployment to enable live config/rules reloading without pod restarts. This is +necessary for the `--web.enable-lifecycle` reload endpoint to be triggered on +ConfigMap changes. + +### 8. No `k3s/30-monitoring-operator` or `k3s/31-monitoring` removal + +The old directories were left in place to avoid breaking any other branch that +references them. They can be removed once the migration is merged to main. 
+ +--- + +## Query comparison results + +All queries run against Prometheus (source of truth), Thanos Query, and GreptimeDB. + +### Test 1: Instant query `up` + +| Backend | Target count | All UP? | +|---------|-------------|---------| +| Prometheus | 3 | Yes (node-exporter, kube-state-metrics, prometheus) | +| Thanos Query | 3 | Yes | +| GreptimeDB | 3 | Yes | + +**Result: Identical** + +### Test 2: Range query `rate(node_cpu_seconds_total{mode="idle"}[5m])` + +| Backend | Series count | Values | +|---------|-------------|--------| +| Prometheus | 12 | cpu=0: 0.306213 ... cpu=11: 0.300287 | +| Thanos Query | 12 | cpu=0: 0.306671 ... cpu=11: 0.300737 | +| GreptimeDB | 12 | cpu=0: 0.306880 ... cpu=11: 0.300942 | + +**Result: Consistent** — minor value differences (<0.3%) due to timestamp alignment +and sample boundaries. Same series count, same label sets. + +### Test 3: Recording rule `instance:node_cpus:count` + +| Backend | Result | +|---------|--------| +| Prometheus | k3d-ystack-server-0: 12 | +| Thanos Query | k3d-ystack-server-0: 12 | +| GreptimeDB | k3d-ystack-server-0: 12 | + +**Result: Identical** — recording rules are evaluated by Prometheus and forwarded via +remote_write to both backends. Both return the correct value. + +### Test 4: Alert expression `kube_pod_status_phase{phase="Pending"} > 0` + +| Backend | Pending pods | +|---------|-------------| +| Prometheus | 0 | +| Thanos Query | 0 | +| GreptimeDB | 0 | + +**Result: Identical** — no pending pods at query time. + +### Test 5: Subquery `avg_over_time(instance:node_cpu_utilization:ratio[5m:])` + +| Backend | Result | +|---------|--------| +| Prometheus | 0.106564 | +| Thanos Query | 0.109263 | +| GreptimeDB | 0.110593 | + +**Result: Consistent** — all three support subquery syntax. Small value differences +from evaluation timing. + +### PromQL incompatibilities observed in GreptimeDB + +**None.** All tested queries returned correct results. 
GreptimeDB handled: +- Instant queries with label matchers +- Rate functions over counters +- Recording rule results (received via remote_write) +- Comparison operators (> 0) +- Subqueries (step-aligned range evaluation) + +--- + +## Resource usage + +Measured via `kubectl top pod` after ~5 minutes of dual remote_write operation. + +| Component | CPU | Memory | Pod count | +|-----------|-----|--------|-----------| +| Prometheus (source) | 12m | 55Mi | 1 (2 containers) | +| Alertmanager | 3m | 18Mi | 1 | +| node-exporter | 4m | 9Mi | 1 | +| kube-state-metrics | 1m | 23Mi | 1 | +| **Thanos Receive** | **2m** | **37Mi** | **1** | +| **Thanos Query** | **2m** | **19Mi** | **1** | +| **GreptimeDB** | **19m** | **261Mi** | **1** | + +### Summary + +| Backend | Total CPU | Total Memory | Pod count | +|---------|-----------|-------------|-----------| +| Thanos (Receive + Query) | 4m | 56Mi | 2 | +| GreptimeDB (standalone) | 19m | 261Mi | 1 | + +Thanos uses **4.75x less CPU** and **4.66x less memory** than GreptimeDB for the same +workload. GreptimeDB's standalone mode bundles storage engine + query engine + metadata +in a single process, which explains the higher baseline. + +--- + +## Object storage comparison + +Both backends configured to write to versitygw S3 buckets. Measured after ~17 minutes +of dual remote_write with Thanos block duration forced to 5m. + +| Backend | Bucket size | Object count | Write pattern | +|---------|------------|-------------|---------------| +| Thanos Receive | 1.4 MB | 9 files (3 blocks) | Block-based: uploads ~3 files per 5m block (meta.json, index, chunks) | +| GreptimeDB | 252 KB | 11 files | Columnar: writes smaller objects more frequently | + +GreptimeDB stores **5.6x less data** on object storage for the same metrics workload. +Its columnar format compresses significantly better than Thanos's TSDB block format. + +**Caveats:** +- Thanos block duration was artificially reduced from 2h to 5m. 
With default settings, + Thanos would batch more data per block, potentially improving compression ratio. +- Thanos Compactor (not deployed in this experiment) further reduces long-term storage + by merging and downsampling blocks. +- GreptimeDB's compaction behavior over longer time windows was not tested. +- 17 minutes of data is too short for definitive storage cost projections — a multi-day + test would be more representative. + +--- + +## Evaluation scores + +Using the criteria from the Mimir replacement research. + +### Query correctness (20%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 10/10 | All queries identical to Prometheus | +| GreptimeDB | 10/10 | All queries returned correct results | + +Both received full marks. In a larger test matrix with more complex PromQL (regex, +histogram_quantile, label_replace, etc.), GreptimeDB might show more divergence. + +### Operational complexity (40%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 7/10 | 2 components (Receive + Query), well-documented, CNCF graduated project. Would need Store + Compactor for production long-term storage. | +| GreptimeDB | 9/10 | 1 component in standalone mode, simpler topology. Distributed mode adds complexity (metasrv, datanode, frontend). | + +GreptimeDB wins on simplicity for small deployments. Thanos has more operational +overhead but is battle-tested at scale. + +### Resource usage (15%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 9/10 | 4m CPU, 56Mi memory — extremely lean | +| GreptimeDB | 5/10 | 19m CPU, 261Mi — higher baseline footprint | + +Thanos is significantly lighter. For a local dev cluster this matters. + +### Maturity (10%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 10/10 | CNCF graduated, v0.37.2, used at massive scale by many organizations | +| GreptimeDB | 6/10 | v0.12.0, growing project, fewer production references. Active development. 
| + +### Storage cost projection (15%) + +| Backend | Score | Notes | +|---------|-------|-------| +| Thanos | 6/10 | 1.4 MB for ~17 min of data. Block-based format is less space-efficient. Compactor helps long-term but adds operational complexity. | +| GreptimeDB | 9/10 | 252 KB for same data — 5.6x smaller. Columnar format compresses metrics data very well. Fewer bytes = lower S3 storage and egress cost. | + +GreptimeDB's columnar storage format produces significantly smaller objects. Both +backends target versitygw S3. While Thanos Compactor can reduce long-term storage, +GreptimeDB's baseline efficiency is notably better. + +### Weighted total + +| Backend | Correctness (20%) | Complexity (40%) | Resources (15%) | Maturity (10%) | Storage (15%) | **Total** | +|---------|-------------------|-----------------|-----------------|---------------|--------------|-----------| +| Thanos | 2.0 | 2.8 | 1.35 | 1.0 | 0.9 | **8.05** | +| GreptimeDB | 2.0 | 3.6 | 0.75 | 0.6 | 1.35 | **8.30** | + +--- + +## Recommendation + +**The two backends are essentially tied (Thanos 8.05 vs GreptimeDB 8.30)** after +accounting for measured object storage efficiency. GreptimeDB's columnar format +produces 5.6x less data on S3, which flips the storage cost score and narrows +Thanos's advantage on maturity and resource usage. + +1. **For ystack local dev clusters**: Thanos is still preferred — lighter CPU/memory + footprint matters in constrained k3d environments, and storage cost is less + relevant with emptyDir/local volumes. + +2. **For production multi-cluster with S3 storage costs**: GreptimeDB deserves + serious consideration — its storage efficiency advantage compounds at scale, + and lower object counts mean fewer S3 API calls (PUT/GET costs). + +3. **Thanos advantages**: CNCF graduated maturity, battle-tested at massive scale, + well-documented multi-tenancy and zone-aware ingestion, lower runtime resource + footprint. + +4. 
**GreptimeDB advantages**: Simpler single-component topology, dramatically better + storage efficiency, SQL access to metrics data, active development pace. + +## Next steps + +1. Remove GreptimeDB from the cluster (losing candidate) +2. Remove dual remote_write — keep only Thanos Receive +3. Add `monitoring/thanos/` to `k3s/30-monitoring/kustomization.yaml` +4. Update validate script to check Thanos components +5. Run `y-cluster-validate-ystack --context=local` to confirm