diff --git a/.gitignore b/.gitignore index 401eae5..781e094 100644 --- a/.gitignore +++ b/.gitignore @@ -23,7 +23,7 @@ config/**/charts coverage.* *.coverprofile profile.cov - +test/e2e/kubeconfig-* # Dependency directories (remove the comment below to include it) # vendor/ @@ -35,5 +35,5 @@ go.work.sum .env # Editor/IDE -# .idea/ +.idea/ # .vscode/ diff --git a/Makefile b/Makefile index ce6e312..2a9c25a 100644 --- a/Makefile +++ b/Makefile @@ -260,6 +260,9 @@ bootstrap-downstream: ## Create kind downstream and deploy agent with embedded P CONTEXT=kind-$(DOWNSTREAM_CLUSTER_NAME) KUSTOMIZE_DIR=config/overlays/agent-powerdns $(MAKE) kustomize-apply # Export external kubeconfig for downstream cluster (reachable from host/other containers) CLUSTER=$(DOWNSTREAM_CLUSTER_NAME) OUT=dev/kind.downstream.kubeconfig $(MAKE) export-kind-kubeconfig-raw + # Install monitoring stack into downstream + CONTEXT=kind-$(DOWNSTREAM_CLUSTER_NAME) KUSTOMIZE_DIR=config/monitoring $(MAKE) kustomize-apply + CONTEXT=kind-$(DOWNSTREAM_CLUSTER_NAME) KUSTOMIZE_DIR=config/overlays/vector-metrics-gateway $(MAKE) kustomize-apply .PHONY: bootstrap-upstream bootstrap-upstream: ## Create kind upstream and deploy replicator pointing to downstream @@ -474,7 +477,7 @@ set -e; \ package=$(2)@$(3) ;\ echo "Downloading $${package}" ;\ rm -f $(1) ;\ -GOBIN=$(LOCALBIN) go install $${package} ;\ +CGO_ENABLED=0 GOOS=$$(go env GOOS) GOARCH=$$(go env GOARCH) GOBIN=$(LOCALBIN) go install $${package} ;\ mv $(1) $(1)-$(3) ;\ } ;\ ln -sf $$(realpath $(1)-$(3)) $(1) diff --git a/config/agent/dnscollector-config.yaml b/config/agent/dnscollector-config.yaml new file mode 100644 index 0000000..5db4407 --- /dev/null +++ b/config/agent/dnscollector-config.yaml @@ -0,0 +1,109 @@ +################################################ +# global configuration +# more details: https://github.com/dmachard/DNS-collector/blob/main/docs/configuration.md#global +################################################ +global: + trace: + 
verbose: true + log-malformed: true + filename: "" + max-size: 10 + max-backups: 10 + server-identity: "dns-collector" + worker: + interval-monitor: 10 + buffer-size: 8192 + telemetry: + enabled: true + web-path: "/metrics" + web-listen: ":9165" + prometheus-prefix: "dnscollector_exporter" + + # Optional TLS configuration + tls-support: false + tls-cert-file: "" + tls-key-file: "" + client-ca-file: "" + + # Optional authentication + basic-auth-enable: false + basic-auth-login: admin + basic-auth-pwd: changeme + +################################################ +# Pipelining configuration +# more details: https://github.com/dmachard/DNS-collector/blob/main/docs/running_mode.md#pipelining +# workers: https://github.com/dmachard/DNS-collector/blob/main/docs/workers.md +# transformers: https://github.com/dmachard/DNS-collector/blob/main/docs/transformers.md +################################################ +pipelines: + - name: tap + dnstap: + listen-ip: 0.0.0.0 + listen-port: 6000 + transforms: + normalize: + enable: true + qname-lowercase: true + rr-lowercase: true + qname-replace-nonprintable: true + add-tld: true + add-tld-plus-one: true + quiet-text: false + reordering: + enable: true + flush-interval: 30 + max-buffer-size: 100 + suspicious: + enable: true + threshold-qname-len: 100 + threshold-packet-len: 1000 + threshold-slow: 1.0 + common-qtypes: + - A + - AAAA + - TXT + - CNAME + - PTR + - NAPTR + - DNSKEY + - SRV + - SOA + - NS + - MX + - DS + - HTTPS + unallowed-chars: + - '"' + - '==' + - '/' + - ':' + threshold-max-labels: 10 + whitelist-domains: + - '\.ip6\.arpa' + latency: + enable: true + measure-latency: true + unanswered-queries: true + queries-timeout: 2 + routing-policy: + forward: [ vector ] + dropped: [ ] + + - name: vector + tcpclient: + transport: tcp + remote-address: 127.0.0.1 + remote-port: 6001 + connect-timeout: 5 + retry-interval: 10 + flush-interval: 30 + tls-insecure: false + tls-min-version: 1.2 + ca-file: "" + cert-file: "" + key-file: 
"" + mode: json + text-format: "" + buffer-size: 100 + chan-buffer-size: 0 \ No newline at end of file diff --git a/config/agent/dnsdist-config.yaml b/config/agent/dnsdist-config.yaml new file mode 100644 index 0000000..bfb472f --- /dev/null +++ b/config/agent/dnsdist-config.yaml @@ -0,0 +1,75 @@ +# TODO(cristhian): Make sure we block api access from outside the cluster. +webserver: + listen_addresses: + - "0.0.0.0:8083" + password: "" + api_key: "" + acl: + - 0.0.0.0/0 + api_requires_authentication: false + stats_require_authentication: false + dashboard_requires_authentication: false + +acl: + - 0.0.0.0/0 + +binds: + - listen_address: "0.0.0.0:53" + reuseport: true + protocol: Do53 + threads: 2 + +packet_caches: + - name: cache + size: 100 + +pools: + - name: default + packet_cache: cache + +backends: + - address: "127.0.0.1:5300" + protocol: Do53 + pools: + - default + +remote_logging: + dnstap_loggers: + - name: remote_logging + transport: tcp + address: "127.0.0.1:6000" + connection_count: 2 + +query_rules: + - name: "log all queries" + selector: + type: All + action: + type: DnstapLog + identity: dnsdist + logger_name: remote_logging + + - name: "default rule" + selector: + type: All + action: + type: Pool + pool_name: default + +response_rules: + - name: log all responses + selector: + type: All + action: + type: DnstapLog + identity: dnsdist + logger_name: remote_logging + +cache_hit_response_rules: + - name: log all responses from cache + selector: + type: All + action: + type: DnstapLog + identity: dnsdist_cache + logger_name: remote_logging \ No newline at end of file diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index b282d52..c60c9e9 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -22,6 +22,15 @@ configMapGenerator: files: - pdns.conf - recursor.conf +- name: dnsdist-config + files: + - dnsdist.yml=dnsdist-config.yaml +- name: dnscollector-config + files: + - 
config.yaml=dnscollector-config.yaml +- name: vector-config + files: + - vector-config.yaml images: - name: ghcr.io/datum-cloud/dns-operator diff --git a/config/agent/manager.yaml b/config/agent/manager.yaml index f853eb5..1d7ed5d 100644 --- a/config/agent/manager.yaml +++ b/config/agent/manager.yaml @@ -131,14 +131,15 @@ spec: mountPath: /config - name: pdns-shared mountPath: /run/pdns + - name: pdns image: powerdns/pdns-auth-51:latest imagePullPolicy: IfNotPresent ports: - - containerPort: 53 + - containerPort: 5300 name: dns protocol: UDP - - containerPort: 53 + - containerPort: 5300 name: dns-tcp protocol: TCP - containerPort: 8082 @@ -157,7 +158,10 @@ spec: - | set -eu; exec pdns_server \ - --api-key="$(cat /run/pdns/api-key)" --api=yes --webserver-port=8082 + --local-port=5300 \ + --api-key="$(cat /run/pdns/api-key)" \ + --api=yes \ + --webserver-port=8082 securityContext: runAsUser: 953 runAsGroup: 953 @@ -167,6 +171,7 @@ spec: drop: - "ALL" add: ["NET_BIND_SERVICE"] + - name: pdns-recursor image: powerdns/pdns-recursor-51:latest imagePullPolicy: IfNotPresent @@ -201,6 +206,7 @@ spec: - name: pdns-config mountPath: /etc/powerdns readOnly: true + - name: lightningstream image: powerdns/lightningstream:main imagePullPolicy: IfNotPresent @@ -258,7 +264,85 @@ spec: mountPath: /lmdb - name: lightningstream-config mountPath: /etc/lightningstream - + + - name: dnsdist + image: powerdns/dnsdist-21:latest + args: ["--config", "/etc/dnsdist/dnsdist.yml"] + imagePullPolicy: IfNotPresent + ports: + - containerPort: 53 + name: dnsdist-udp + protocol: UDP + - containerPort: 53 + name: dnsdist-tcp + protocol: TCP + - containerPort: 8083 + name: dnsdist-metrics + protocol: TCP + volumeMounts: + - name: pdns-shared + mountPath: /run/pdns + - name: dnsdist-config + mountPath: /etc/dnsdist + readOnly: true + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - "ALL" + add: ["NET_BIND_SERVICE"] + + - name: 
dnstap-collector + image: dmachard/dnscollector:latest + imagePullPolicy: IfNotPresent + args: ["-config", "/etc/dnscollector/config.yaml"] + ports: + - containerPort: 6000 + name: dnstap + protocol: TCP + - containerPort: 9165 + name: dnscol-metrics + protocol: TCP + volumeMounts: + - name: dnscollector-config + mountPath: /etc/dnscollector + readOnly: true + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - "ALL" + + - name: vector + image: timberio/vector:0.51.1-distroless-static + args: + - --log-format=json + - --verbose + - --watch-config + - --config-dir + - /etc/vector/ + env: + - name: VECTOR_METRICS_GATEWAY_ADDRESS + value: vector-metrics-gateway:9000 + volumeMounts: + - name: vector-config + mountPath: /etc/vector/vector-config.yaml + subPath: vector-config.yaml + - name: vector-config-volume + mountPath: /etc/vector + ports: + - containerPort: 9598 + name: vector-metrics + protocol: TCP + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi volumes: - name: server-config configMap: @@ -271,5 +355,16 @@ spec: - name: pdns-config configMap: name: pdns-config + - name: dnsdist-config + configMap: + name: dnsdist-config + - name: dnscollector-config + configMap: + name: dnscollector-config + - name: vector-config + configMap: + name: vector-config + - name: vector-config-volume + emptyDir: {} serviceAccountName: controller-manager terminationGracePeriodSeconds: 10 diff --git a/config/agent/pdns-service.yaml b/config/agent/pdns-service.yaml index cd1ff9b..f2edc98 100644 --- a/config/agent/pdns-service.yaml +++ b/config/agent/pdns-service.yaml @@ -18,3 +18,15 @@ spec: port: 8082 targetPort: 8082 protocol: TCP + - name: metrics + port: 8083 + targetPort: 8083 + protocol: TCP + - name: dnscol-metrics + port: 9165 + targetPort: 9165 + protocol: TCP + - name: vector-metrics + port: 9598 + targetPort: 9598 + protocol: TCP diff --git a/config/agent/pdns.conf 
b/config/agent/pdns.conf index 278d685..fa04b6f 100644 --- a/config/agent/pdns.conf +++ b/config/agent/pdns.conf @@ -23,7 +23,7 @@ api=yes # api-key will be passed via CLI using /run/pdns/api-key # Required for ALIAS expansion: point to an in-pod resolver/recursor. -resolver=127.0.0.1:5300 +resolver=127.0.0.1:5301 expand-alias=yes zone-cache-refresh-interval=0 @@ -37,4 +37,3 @@ lmdb-random-ids=yes lmdb-flag-deleted=yes lmdb-map-size=1000 lmdb-lightning-stream=yes - diff --git a/config/agent/recursor.conf b/config/agent/recursor.conf index 182f4cd..3f0670e 100644 --- a/config/agent/recursor.conf +++ b/config/agent/recursor.conf @@ -14,8 +14,8 @@ recursor: incoming: # Only serve recursion to the local pod; auth will use this for ALIAS expansion. listen: - - "127.0.0.1:5300" - - "[::1]:5300" + - "127.0.0.1:5301" + - "[::1]:5301" allow_from: - "127.0.0.1/32" - "::1/128" diff --git a/config/agent/vector-config.yaml b/config/agent/vector-config.yaml new file mode 100644 index 0000000..aae9015 --- /dev/null +++ b/config/agent/vector-config.yaml @@ -0,0 +1,155 @@ +data_dir: /var/lib/vector + +api: + enabled: false + +sources: + dnstap_tcp: + type: socket + address: 0.0.0.0:6001 + mode: tcp + decoding: + codec: json + internal_metrics: + type: internal_metrics + +transforms: + dnstap_enriched: + type: remap + inputs: + - dnstap_tcp + source: | + # Reuse dns-collector enrichment; only normalize for metrics + .message_type = upcase(to_string(.dnstap.operation) ?? "UNKNOWN") + .server_id = to_string(.dnstap.identity) ?? "unknown" + .stream_id = .server_id + .proto = to_string(.network.protocol) ?? "unknown" + .family = to_string(.network.family) ?? "unknown" + .opcode = to_string(.dns.opcode) ?? "unknown" + .qname = downcase(to_string(.dns.qname) ?? "unknown") + .qtype = to_string(.dns.qtype) ?? "unknown" + .rcode = upcase(to_string(.dns.rcode) ?? "UNKNOWN") + .zone = downcase(to_string(.publicsuffix."etld+1") ?? 
.qname) + .hit = 1 + + if exists(.dnstap.latency) { + lat, err = to_float(.dnstap.latency) + if err == null { + .latency_seconds = lat + } + } + + # Cache heuristic: dnsdist identity naming (e.g., *_cache) + .cache_hit = contains(.server_id, "_cache") + + # DNSSEC status via AD bit and SERVFAIL heuristic + ad = to_bool(.dns.flags.ad) ?? false + status = "insecure" + if ad { status = "secure" } + if .rcode == "SERVFAIL" { status = "bogus" } + .dnssec_status = status + + dnstap_metrics: + type: log_to_metric + inputs: + - dnstap_enriched + metrics: + - type: counter + name: dns_queries_total + field: hit + condition: '.message_type == "CLIENT_QUERY"' + tags: + family: "{{family}}" + proto: "{{proto}}" + opcode: "{{opcode}}" + qtype: "{{qtype}}" + zone: "{{zone}}" + server: "{{server_id}}" + stream: "{{stream_id}}" + - type: counter + name: dns_responses_total + field: hit + condition: '.message_type == "CLIENT_RESPONSE"' + tags: + family: "{{family}}" + proto: "{{proto}}" + qtype: "{{qtype}}" + rcode: "{{rcode}}" + server: "{{server_id}}" + zone: "{{zone}}" + stream: "{{stream_id}}" + - type: counter + name: dns_cache_hits_total + field: hit + condition: '.message_type == "CLIENT_RESPONSE" && .cache_hit == true' + tags: + proto: "{{proto}}" + server: "{{server_id}}" + zone: "{{zone}}" + qtype: "{{qtype}}" + - type: counter + name: dns_dnssec_status_total + field: hit + condition: '.message_type == "CLIENT_RESPONSE"' + tags: + proto: "{{proto}}" + server: "{{server_id}}" + zone: "{{zone}}" + dnssec_status: "{{dnssec_status}}" + - type: counter + name: dns_servfail_total + field: hit + condition: '.message_type == "CLIENT_RESPONSE" && .rcode == "SERVFAIL"' + tags: + proto: "{{proto}}" + server: "{{server_id}}" + zone: "{{zone}}" + qtype: "{{qtype}}" + - type: counter + name: dns_nxdomain_total + field: hit + condition: '.message_type == "CLIENT_RESPONSE" && .rcode == "NXDOMAIN"' + tags: + proto: "{{proto}}" + server: "{{server_id}}" + zone: "{{zone}}" + qtype: 
"{{qtype}}" + - type: histogram + name: dns_response_latency_seconds + field: latency_seconds + condition: '.message_type == "CLIENT_RESPONSE" && exists(.latency_seconds)' + bins: [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5] + tags: + proto: "{{proto}}" + qtype: "{{qtype}}" + rcode: "{{rcode}}" + server: "{{server_id}}" + zone: "{{zone}}" + +sinks: + console: + type: console + target: stdout + encoding: + codec: json + inputs: + - dnstap_tcp + - dnstap_metrics + + central_vector: + type: vector + inputs: + - dnstap_metrics + address: ${VECTOR_METRICS_GATEWAY_ADDRESS} + healthcheck: + enabled: true + buffer: + type: disk + max_size: 268435488 + when_full: block + + prometheus: + type: prometheus_exporter + inputs: + - internal_metrics + address: 0.0.0.0:9598 diff --git a/config/components/vector-metrics-gateway/helmrepository.yaml b/config/components/vector-metrics-gateway/helmrepository.yaml new file mode 100644 index 0000000..602bb21 --- /dev/null +++ b/config/components/vector-metrics-gateway/helmrepository.yaml @@ -0,0 +1,8 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: vector + namespace: dns-agent-system +spec: + interval: 1h + url: https://helm.vector.dev diff --git a/config/components/vector-metrics-gateway/kustomization.yaml b/config/components/vector-metrics-gateway/kustomization.yaml new file mode 100644 index 0000000..cf3a2a7 --- /dev/null +++ b/config/components/vector-metrics-gateway/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - helmrepository.yaml + - vector-hr.yaml diff --git a/config/components/vector-metrics-gateway/vector-hr.yaml b/config/components/vector-metrics-gateway/vector-hr.yaml new file mode 100644 index 0000000..634d1a5 --- /dev/null +++ b/config/components/vector-metrics-gateway/vector-hr.yaml @@ -0,0 +1,325 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: 
vector-metrics-gateway + namespace: dns-agent-system +spec: + interval: 5m + timeout: 1m + + chart: + spec: + chart: vector + version: 0.49.x + sourceRef: + kind: HelmRepository + name: vector + namespace: dns-agent-system + interval: 1h + values: + fullnameOverride: vector-metrics-gateway + # Role: Aggregator (stateless, can run as Deployment) + role: Stateless-Aggregator + replicas: 2 + image: + repository: timberio/vector + tag: 0.52.0-distroless-libc + pullPolicy: IfNotPresent + args: + - --log-format=json + - --verbose + - --watch-config + - --config-dir + - /etc/vector/ + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 1Gi + podDisruptionBudget: + enabled: true + minAvailable: 1 + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 5 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + service: + enabled: true + type: ClusterIP + ports: + - name: vector + port: 9000 + targetPort: vector + protocol: TCP + podMonitor: + enabled: true + port: internal-promet + + serviceAccount: + create: true + name: vector + + containerPorts: + - name: vector + containerPort: 9000 + protocol: TCP + + serviceHeadless: + enabled: false + + defaultVolumes: [] + defaultVolumeMounts: [] + + extraVolumes: + - name: vector-enrichment + emptyDir: {} + - name: vector-enrichment-scripts + configMap: + name: vector-enrichment-scripts + defaultMode: 0555 + + extraVolumeMounts: + - name: vector-enrichment + mountPath: /etc/vector/enrichment + readOnly: true + + initContainers: + - name: init-enrichment-table + image: busybox:1.36 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + set -eu + mkdir -p /enrichment/configmaps + # create a valid CSV with headers so Vector can compile VRL at startup + printf 'zone,project\n' > /enrichment/zone_project.csv + volumeMounts: + - name: vector-enrichment + mountPath: /enrichment + + extraContainers: + - name: zone-accounting-sidecar + image: kiwigrid/k8s-sidecar:1.30.9 
+ imagePullPolicy: IfNotPresent + env: + - name: RESOURCE + value: configmap + - name: NAMESPACE + value: datum-downstream-dnszone-accounting + - name: LABEL + value: datum.net/dnszone-owner + - name: LABEL_VALUE + value: "" + - name: FOLDER + value: /tmp/ + - name: UNIQUE_FILENAMES + value: "true" + - name: METHOD + value: WATCH + - name: SCRIPT + value: /scripts/build-zone-project.sh + volumeMounts: + - name: vector-enrichment + mountPath: /enrichment + - name: vector-enrichment-scripts + mountPath: /scripts + readOnly: true + + customConfig: + data_dir: /var/lib/vector + api: + enabled: false + enrichment_tables: + zone_project: + type: file + file: + path: /etc/vector/enrichment/zone_project.csv + encoding: + type: csv + delimiter: "," + schema: + zone: string + project: string + sources: + edge_metrics: + type: vector + address: 0.0.0.0:9000 + internal_metrics: + type: internal_metrics + namespace: vector + transforms: + add_project_tags: + type: remap + inputs: + - edge_metrics + source: | + # Add tags if missing + if !exists(.tags) { .tags = {} } + + # Get zone from metric tags + zone = to_string(.tags.zone) ?? null + + if zone != null { + + # Lookup project from enrichment table + record, err = get_enrichment_table_record("zone_project", {"zone": zone}) + + # Add project tag if found + if err == null && record != null && exists(record.project) { + .tags.project = to_string(record.project) ?? 
record.project + } + } + sinks: + + console: + type: console + target: stdout + encoding: + codec: json + inputs: + - add_project_tags + + victoria_metrics: + type: prometheus_remote_write + inputs: + - add_project_tags + endpoint: ${VICTORIA_METRICS_REMOTE_WRITE_URL} + batch: + max_bytes: 262144 + max_events: 500 + timeout_secs: 5 + buffer: + type: disk + max_size: 10737418240 + when_full: block + request: + retry_attempts: 9999999 + retry_initial_backoff_secs: 1 + retry_max_duration_secs: 300 + timeout_secs: 300 + healthcheck: + enabled: true + compression: snappy + + # Export metrics over prometheus + internal_prometheus: + type: prometheus_exporter + inputs: + - internal_metrics + address: 0.0.0.0:9091 + default_namespace: vector_internal + + extraObjects: + - apiVersion: v1 + kind: ConfigMap + metadata: + name: vector-enrichment-scripts + namespace: "{{ .Release.Namespace }}" + data: + build-zone-project.sh: | + #!/bin/sh + # Build zone_project.csv from datum-downstream-dnszone-accounting ConfigMaps. + # + # Input file example: + # namespace_.configmap_.owner (content: //<...>) + # + # Output CSV: + # zone,project + # , + + set -eu + + SRC_DIR="${ENRICHMENT_CONFIGMAP_DIR:-/tmp}" + OUT_FILE="${ENRICHMENT_OUTPUT_FILE:-/enrichment/zone_project.csv}" + + mkdir -p "$(dirname "$OUT_FILE")" + + tmp="$(mktemp)" + trap 'rm -f "$tmp"' EXIT + + # Always produce a valid CSV (header at minimum). + printf 'zone,project\n' > "$tmp" + + # Nothing to do if input directory doesn't exist. + [ -d "$SRC_DIR" ] || { mv "$tmp" "$OUT_FILE"; exit 0; } + + # Collect rows, then sort+dedupe for stable output. + rows="$(mktemp)" + trap 'rm -f "$tmp" "$rows"' EXIT + : > "$rows" + + for f in "$SRC_DIR"/*.owner; do + [ -f "$f" ] || continue + + base="$(basename "$f")" + # Expect: namespace_.configmap_.owner -> extract + zone="${base#*configmap_}" + zone="${zone%.owner}" + + # Skip if we can't infer zone. 
+ [ -n "$zone" ] && [ "$zone" != "$base" ] || continue + + owner="$(tr -d '\r\n' < "$f")" + owner="${owner#/}" # trim leading / + project="${owner%%/*}" # first segment + + [ -n "$project" ] || continue + + printf '%s,%s\n' "$zone" "$project" >> "$rows" + done + + # Append sorted unique rows. + if [ -s "$rows" ]; then + sort "$rows" | uniq >> "$tmp" + fi + + mv "$tmp" "$OUT_FILE" + - apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRole + metadata: + name: vector-configmap-reader + rules: + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "watch"] + - apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: vector-configmap-reader + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: vector-configmap-reader + subjects: + - kind: ServiceAccount + name: vector + namespace: "{{ .Release.Namespace }}" + + env: + - name: VICTORIA_METRICS_REMOTE_WRITE_URL + value: http://prometheus.dns-monitoring.svc:9090/api/v1/write + + # Persistence for buffer + persistence: + enabled: true + storageClassName: "" # Use default storage class + accessModes: + - ReadWriteOnce + size: 10Gi + + install: + crds: Create + createNamespace: false + + upgrade: + crds: CreateReplace + + uninstall: + keepHistory: false diff --git a/config/monitoring/README.md b/config/monitoring/README.md new file mode 100644 index 0000000..c3f4b26 --- /dev/null +++ b/config/monitoring/README.md @@ -0,0 +1,46 @@ +# Telemetry & Observability + +## Overview + +Lightweight observability stack for the DNS agent components. + +## Components +- **Grafana**: Pre-provisioned Prometheus and Loki data sources. +- **Prometheus**: Single instance scraping dnsdist, dnscollector, and vector + metrics out of the box. +- **Loki**: Single-binary log storage for dnstap/log forwarding from vector. +- **Vector (central)**: Receives metrics from edge Vector and forwards to the + consumer-facing Victoria Metrics remote-write endpoint. 
+- **Namespace**: `dns-monitoring` is created automatically. + +## Deploy +Apply the full stack: +```bash +kubectl apply -k config/monitoring +``` + +Grafana credentials are `admin` / `admin` (stored in +`Secret/grafana-admin`). + +Port-forward to reach the UI: +```bash +kubectl -n dns-monitoring port-forward svc/grafana 3000:80 +open http://localhost:3000 +``` + +## Data sources +- Prometheus URL: `${PROMETHEUS_URL}` (default + `http://prometheus.dns-monitoring.svc:9090`) +- Loki URL: `${LOKI_URL}` (default `http://loki.dns-monitoring.svc:3100`) +The central Vector instance uses `VICTORIA_METRICS_REMOTE_WRITE_URL` (default +`http://prometheus.dns-monitoring.svc:9090/api/v1/write`). + +If you want to use an existing cluster Prometheus instead of the bundled one, +patch `PROMETHEUS_URL` and remove the `prometheus` entry from +`config/monitoring/kustomization.yaml` before applying. + +## Prometheus scraping +The bundled Prometheus scrapes: +- `dnsdist` at `pdns-auth.dns-agent-system.svc:8083` (`/metrics`) +- `dnscollector_exporter` at `pdns-auth.dns-agent-system.svc:9165` (`/metrics`) +- `vector` at `pdns-auth.dns-agent-system.svc:9598` (`/metrics`) diff --git a/config/monitoring/grafana/dashboards/dashboards.yaml b/config/monitoring/grafana/dashboards/dashboards.yaml new file mode 100644 index 0000000..8923c60 --- /dev/null +++ b/config/monitoring/grafana/dashboards/dashboards.yaml @@ -0,0 +1,10 @@ +apiVersion: 1 +providers: + - name: dns-observability + orgId: 1 + folder: DNS + type: file + disableDeletion: true + editable: true + options: + path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/config/monitoring/grafana/dashboards/dns_workbench.json b/config/monitoring/grafana/dashboards/dns_workbench.json new file mode 100644 index 0000000..715b1dc --- /dev/null +++ b/config/monitoring/grafana/dashboards/dns_workbench.json @@ -0,0 +1,900 @@ +{ + "timezone": "", + "title": "DNS Workbench", + "uid": "", + "schemaVersion": 39, + "version": 1, + 
"weekStart": "", + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "sum(increase(dns_queries_total[$__range]))", + "refId": "A" + } + ], + "title": "Total Queries", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "1000 * 
(sum(rate(dns_response_latency_seconds_sum[5m])) / sum(rate(dns_response_latency_seconds_count[5m])))", + "refId": "A" + } + ], + "title": "Avg Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "sum(rate(dns_cache_hits_total[5m])) / sum(rate(dns_responses_total[5m]))", + "refId": "A" + } + ], + "title": "Cache Hit Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "sum(rate(dns_responses_total{rcode=\"NXDOMAIN\"}[5m])) / sum(rate(dns_responses_total[5m]))", + "refId": "A" + } + ], + "title": 
"NXDOMAIN Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "sum(rate(dns_queries_total[5m])) by (qtype)", + "refId": "A" + } + ], + "title": "Query Type Distribution", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "sum(rate(dns_responses_total[5m])) by (rcode)", + "refId": "A" + } + ], + "title": "Response Code Distribution", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value_and_name", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "sum(rate(dns_dnssec_status_total[5m])) by (dnssec_status)", + "refId": "A" + } + ], + "title": "DNSSEC Validation", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 9, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "sum(rate(dns_queries_total[5m])) by (stream)", + "refId": "A" + } + ], + "title": "Query Source Distribution", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": 
"red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 8, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "topk(10, sum(rate(dns_queries_total[5m])) by (zone))", + "refId": "A" + } + ], + "title": "Top Zones", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 24 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "1000 * histogram_quantile(0.5, sum(rate(dns_response_latency_seconds_bucket[5m])) by (le))", + "refId": "A" + } + ], + "title": "p50 Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 24 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "1000 * histogram_quantile(0.75, sum(rate(dns_response_latency_seconds_bucket[5m])) by (le))", + "refId": "A" + } + ], + "title": "p75 Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 24 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "1000 * histogram_quantile(0.9, sum(rate(dns_response_latency_seconds_bucket[5m])) by (le))", + "refId": "A" + } + ], + "title": "p90 Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 24 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": 
"1000 * histogram_quantile(0.95, sum(rate(dns_response_latency_seconds_bucket[5m])) by (le))", + "refId": "A" + } + ], + "title": "p95 Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 24 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "expr": "1000 * histogram_quantile(0.99, sum(rate(dns_response_latency_seconds_bucket[5m])) by (le))", + "refId": "A" + } + ], + "title": "p99 Latency", + "type": "stat" + } + ], + "refresh": "30s", + "tags": [ + "dns", + "dnstap", + "workbench" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Prometheus", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h" + ] + } +} \ No newline at end of file diff --git a/config/monitoring/grafana/datasources/datasources.yaml b/config/monitoring/grafana/datasources/datasources.yaml new file mode 100644 index 0000000..39bfe51 --- /dev/null +++ b/config/monitoring/grafana/datasources/datasources.yaml @@ -0,0 +1,17 @@ +apiVersion: 1 +datasources: + - 
name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: ${PROMETHEUS_URL} + isDefault: true + editable: true + jsonData: + timeInterval: 30s + - name: Loki + uid: loki + type: loki + access: proxy + url: ${LOKI_URL} + editable: true \ No newline at end of file diff --git a/config/monitoring/grafana/deployment.yaml b/config/monitoring/grafana/deployment.yaml new file mode 100644 index 0000000..0a4b2d9 --- /dev/null +++ b/config/monitoring/grafana/deployment.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana + labels: + app.kubernetes.io/name: grafana +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: grafana + template: + metadata: + labels: + app.kubernetes.io/name: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:10.4.2 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 3000 + name: http + protocol: TCP + env: + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana-admin + key: admin-user + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-admin + key: admin-password + - name: PROMETHEUS_URL + value: http://prometheus.dns-monitoring.svc:9090 + - name: LOKI_URL + value: http://loki.dns-monitoring.svc:3100 + volumeMounts: + - name: datasources + mountPath: /etc/grafana/provisioning/datasources/datasources.yaml + subPath: datasources.yaml + - name: dashboard-providers + mountPath: /etc/grafana/provisioning/dashboards/dashboards.yaml + subPath: dashboards.yaml + - name: dashboards + mountPath: /var/lib/grafana/dashboards + - name: storage + mountPath: /var/lib/grafana + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 30 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 1Gi + volumes: + - name: datasources + 
configMap: + name: grafana-datasources + - name: dashboard-providers + configMap: + name: grafana-dashboard-providers + - name: dashboards + configMap: + name: grafana-dashboards + - name: storage + emptyDir: {} diff --git a/config/monitoring/grafana/kustomization.yaml b/config/monitoring/grafana/kustomization.yaml new file mode 100644 index 0000000..2c1e3cc --- /dev/null +++ b/config/monitoring/grafana/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - deployment.yaml + - service.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: grafana-dashboard-providers + files: + - dashboards/dashboards.yaml + - name: grafana-dashboards + files: + - dashboards/dns_workbench.json + - name: grafana-datasources + files: + - datasources/datasources.yaml + +secretGenerator: + - name: grafana-admin + literals: + - admin-user=admin + - admin-password=admin diff --git a/config/monitoring/grafana/service.yaml b/config/monitoring/grafana/service.yaml new file mode 100644 index 0000000..59145d4 --- /dev/null +++ b/config/monitoring/grafana/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana + labels: + app.kubernetes.io/name: grafana +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: grafana + ports: + - name: http + port: 80 + targetPort: 3000 + protocol: TCP diff --git a/config/monitoring/kustomization.yaml b/config/monitoring/kustomization.yaml new file mode 100644 index 0000000..7113c0c --- /dev/null +++ b/config/monitoring/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: dns-monitoring + +resources: + - namespace.yaml + - grafana + - prometheus + - loki diff --git a/config/monitoring/loki/config/loki.yaml b/config/monitoring/loki/config/loki.yaml new file mode 100644 index 0000000..b871983 --- /dev/null +++ b/config/monitoring/loki/config/loki.yaml @@ -0,0 +1,43
@@ +auth_enabled: false +server: + http_listen_port: 3100 +common: + path_prefix: /var/loki + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + replication_factor: 1 +schema_config: + configs: + - from: 2020-10-24 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h +storage_config: + boltdb_shipper: + active_index_directory: /var/loki/index + cache_location: /var/loki/cache + shared_store: filesystem + filesystem: + directory: /var/loki/chunks +ingester: + wal: + dir: /var/loki/wal + chunk_idle_period: 5m + chunk_retain_period: 30s + max_chunk_age: 1h +limits_config: + retention_period: 168h + reject_old_samples: true + reject_old_samples_max_age: 168h +compactor: + working_directory: /var/loki/compactor + shared_store: filesystem + # limits_config.retention_period is only enforced when compactor retention is enabled. + retention_enabled: true +analytics: + reporting_enabled: false \ No newline at end of file diff --git a/config/monitoring/loki/deployment.yaml b/config/monitoring/loki/deployment.yaml new file mode 100644 index 0000000..1ed8144 --- /dev/null +++ b/config/monitoring/loki/deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loki + labels: + app.kubernetes.io/name: loki +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: loki + template: + metadata: + labels: + app.kubernetes.io/name: loki + spec: + securityContext: + fsGroup: 10001 + containers: + - name: loki + image: grafana/loki:2.9.4 + imagePullPolicy: IfNotPresent + args: + - -config.file=/etc/loki/loki.yaml + ports: + - containerPort: 3100 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /ready + port: 3100 + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /ready + port: 3100 + initialDelaySeconds: 30 + periodSeconds: 30 + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: 1 + memory: 2Gi + volumeMounts: + - name: config + mountPath: /etc/loki + - name: data + mountPath: /var/loki + volumes: + - name: config +
configMap: + name: loki-config + - name: data + emptyDir: {} diff --git a/config/monitoring/loki/kustomization.yaml b/config/monitoring/loki/kustomization.yaml new file mode 100644 index 0000000..0b1acb8 --- /dev/null +++ b/config/monitoring/loki/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - deployment.yaml + - service.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: loki-config + files: + - config/loki.yaml diff --git a/config/monitoring/loki/service.yaml b/config/monitoring/loki/service.yaml new file mode 100644 index 0000000..48c0c86 --- /dev/null +++ b/config/monitoring/loki/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: loki + labels: + app.kubernetes.io/name: loki +spec: + selector: + app.kubernetes.io/name: loki + ports: + - name: http + port: 3100 + targetPort: 3100 + protocol: TCP diff --git a/config/monitoring/namespace.yaml b/config/monitoring/namespace.yaml new file mode 100644 index 0000000..cf27890 --- /dev/null +++ b/config/monitoring/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: dns-monitoring + labels: + app.kubernetes.io/name: dns-monitoring diff --git a/config/monitoring/prometheus/config/prometheus.yml b/config/monitoring/prometheus/config/prometheus.yml new file mode 100644 index 0000000..5847a45 --- /dev/null +++ b/config/monitoring/prometheus/config/prometheus.yml @@ -0,0 +1,34 @@ +global: + scrape_interval: 30s + evaluation_interval: 30s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + + - job_name: dnsdist + metrics_path: /metrics + static_configs: + - targets: + - pdns-auth.dns-agent-system.svc:8083 + labels: + app: dnsdist + + - job_name: dnscollector_exporter + metrics_path: /metrics + scrape_interval: 5s + static_configs: + - targets: + - pdns-auth.dns-agent-system.svc:9165 + labels: + app: 
dnscollector_exporter + + - job_name: vector + metrics_path: /metrics + static_configs: + - targets: + - pdns-auth.dns-agent-system.svc:9598 + labels: + app: vector \ No newline at end of file diff --git a/config/monitoring/prometheus/deployment.yaml b/config/monitoring/prometheus/deployment.yaml new file mode 100644 index 0000000..8ecfbc2 --- /dev/null +++ b/config/monitoring/prometheus/deployment.yaml @@ -0,0 +1,48 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + labels: + app.kubernetes.io/name: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: prometheus + template: + metadata: + labels: + app.kubernetes.io/name: prometheus + spec: + containers: + - name: prometheus + image: prom/prometheus:v2.52.0 + imagePullPolicy: IfNotPresent + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --web.enable-lifecycle + - --web.enable-remote-write-receiver + ports: + - name: http + containerPort: 9090 + protocol: TCP + volumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: prometheus-storage + mountPath: /prometheus + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + - name: prometheus-storage + emptyDir: {} diff --git a/config/monitoring/prometheus/kustomization.yaml b/config/monitoring/prometheus/kustomization.yaml new file mode 100644 index 0000000..2e6380d --- /dev/null +++ b/config/monitoring/prometheus/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - deployment.yaml + - service.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: prometheus-config + files: + - config/prometheus.yml diff --git a/config/monitoring/prometheus/service.yaml b/config/monitoring/prometheus/service.yaml 
new file mode 100644 index 0000000..732d3f1 --- /dev/null +++ b/config/monitoring/prometheus/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + labels: + app.kubernetes.io/name: prometheus +spec: + selector: + app.kubernetes.io/name: prometheus + ports: + - name: http + port: 9090 + targetPort: http + protocol: TCP diff --git a/config/monitoring/vector/config/vector-config.yaml b/config/monitoring/vector/config/vector-config.yaml new file mode 100644 index 0000000..999fca5 --- /dev/null +++ b/config/monitoring/vector/config/vector-config.yaml @@ -0,0 +1,65 @@ +data_dir: /var/lib/vector + +api: + enabled: false + +enrichment_tables: + zone_project: + type: file + file: + path: /etc/vector/enrichment/zone_project.csv + encoding: + type: csv + delimiter: "," + schema: + zone: string + project: string + +sources: + edge_metrics: + type: vector + address: 0.0.0.0:9000 + +transforms: + add_project_tags: + type: remap + inputs: + - edge_metrics + source: | + # Add tags if missing + if !exists(.tags) { .tags = {} } + + # Get zone from metric tags + zone = to_string(.tags.zone) ?? null + + if zone != null { + + # Lookup project from enrichment table + record, err = get_enrichment_table_record("zone_project", {"zone": zone}) + + # Add project tag if found + if err == null && record != null && exists(record.project) { + .tags.project = to_string(record.project) ?? 
record.project + } + } + +sinks: + console: + type: console + target: stdout + encoding: + codec: json + inputs: + - add_project_tags + + victoria_metrics: + type: prometheus_remote_write + inputs: + - add_project_tags + endpoint: ${VICTORIA_METRICS_REMOTE_WRITE_URL} + healthcheck: + enabled: false + buffer: + type: disk + max_size: 268435488 + when_full: block diff --git a/config/monitoring/vector/deployment.yaml b/config/monitoring/vector/deployment.yaml new file mode 100644 index 0000000..7e22751 --- /dev/null +++ b/config/monitoring/vector/deployment.yaml @@ -0,0 +1,107 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector + labels: + app.kubernetes.io/name: vector +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: vector + template: + metadata: + labels: + app.kubernetes.io/name: vector + spec: + serviceAccountName: vector + initContainers: + - name: init-enrichment-table + image: busybox:1.36 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + set -eu + mkdir -p /enrichment/configmaps + # create a valid CSV with headers so Vector can compile VRL at startup + printf 'zone,project\n' > /enrichment/zone_project.csv + volumeMounts: + - name: vector-enrichment + mountPath: /enrichment + containers: + - name: vector + image: timberio/vector:0.51.1-distroless-static + imagePullPolicy: IfNotPresent + args: + - --log-format=json + - --verbose + - --watch-config + - --config-dir + - /etc/vector/ + env: + - name: VICTORIA_METRICS_REMOTE_WRITE_URL + value: http://prometheus.dns-monitoring.svc:9090/api/v1/write + ports: + - containerPort: 9000 + name: vector + protocol: TCP + volumeMounts: + - name: vector-config + mountPath: /etc/vector/vector-config.yaml + subPath: vector-config.yaml + - name: vector-config-volume + mountPath: /etc/vector + - name: vector-enrichment + mountPath: /etc/vector/enrichment + readOnly: true + - name: vector-data + mountPath: /var/lib/vector + resources: + requests: + cpu: 100m + memory: 256Mi 
+ limits: + cpu: 500m + memory: 512Mi + - name: zone-accounting-sidecar + image: kiwigrid/k8s-sidecar:1.30.9 + imagePullPolicy: IfNotPresent + env: + - name: RESOURCE + value: configmap + - name: NAMESPACE + value: datum-downstream-dnszone-accounting + - name: LABEL + value: datum.net/dnszone-owner # must equal the label key DNSZoneReplicator.ensureZoneAccounting puts on accounting ConfigMaps + - name: LABEL_VALUE + value: "" + - name: FOLDER + value: /tmp/ + - name: UNIQUE_FILENAMES + value: "true" + - name: METHOD + value: WATCH + - name: SCRIPT + value: /scripts/build-zone-project.sh + volumeMounts: + - name: vector-enrichment + mountPath: /enrichment + - name: vector-enrichment-scripts + mountPath: /scripts + readOnly: true + volumes: + - name: vector-config + configMap: + name: vector-config + - name: vector-config-volume + emptyDir: {} + - name: vector-enrichment + emptyDir: {} + - name: vector-enrichment-scripts + configMap: + name: vector-enrichment-scripts + defaultMode: 0555 + - name: vector-data + emptyDir: {} diff --git a/config/monitoring/vector/kustomization.yaml b/config/monitoring/vector/kustomization.yaml new file mode 100644 index 0000000..6f11db5 --- /dev/null +++ b/config/monitoring/vector/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - deployment.yaml + - service.yaml + - rbac.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: vector-config + files: + - config/vector-config.yaml + - name: vector-enrichment-scripts + files: + - scripts/build-zone-project.sh diff --git a/config/monitoring/vector/rbac.yaml b/config/monitoring/vector/rbac.yaml new file mode 100644 index 0000000..0a60106 --- /dev/null +++ b/config/monitoring/vector/rbac.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vector +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: vector-configmap-reader +rules: + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "watch"] +---
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vector-configmap-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: vector-configmap-reader +subjects: + - kind: ServiceAccount + name: vector + namespace: dns-monitoring diff --git a/config/monitoring/vector/scripts/build-zone-project.sh b/config/monitoring/vector/scripts/build-zone-project.sh new file mode 100644 index 0000000..f57cdb1 --- /dev/null +++ b/config/monitoring/vector/scripts/build-zone-project.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# Build zone_project.csv from datum-downstream-dnszone-accounting ConfigMaps. +# +# Input file example: +# namespace_<namespace>.configmap_<zone>.owner (content: [/]<project>/<...>) +# +# Output CSV: +# zone,project +# <zone>,<project> + +set -eu + +SRC_DIR="${ENRICHMENT_CONFIGMAP_DIR:-/tmp}" +OUT_FILE="${ENRICHMENT_OUTPUT_FILE:-/enrichment/zone_project.csv}" + +mkdir -p "$(dirname "$OUT_FILE")" + +# Create the temp file next to OUT_FILE so the final mv is a same-filesystem +# rename (atomic for readers) and never lands in SRC_DIR, which defaults to /tmp. +tmp="$(mktemp "${OUT_FILE}.XXXXXX")" +trap 'rm -f "$tmp"' EXIT +# mktemp creates the file 0600 and mv keeps that mode; relax it so a reader running as another UID can open the CSV. +chmod 0644 "$tmp" + +# Always produce a valid CSV (header at minimum). +printf 'zone,project\n' > "$tmp" + +# Nothing to do if input directory doesn't exist. +[ -d "$SRC_DIR" ] || { mv "$tmp" "$OUT_FILE"; exit 0; } + +# Collect rows, then sort+dedupe for stable output. +rows="$(mktemp)" +trap 'rm -f "$tmp" "$rows"' EXIT +: > "$rows" + +for f in "$SRC_DIR"/*.owner; do + [ -f "$f" ] || continue + + base="$(basename "$f")" + # Expect: namespace_<namespace>.configmap_<zone>.owner -> extract <zone> + zone="${base#*configmap_}" + zone="${zone%.owner}" + + # Skip if we can't infer zone. + [ -n "$zone" ] && [ "$zone" != "$base" ] || continue + + owner="$(tr -d '\r\n' < "$f")" + owner="${owner#/}" # trim leading / + project="${owner%%/*}" # first segment + + [ -n "$project" ] || continue + + printf '%s,%s\n' "$zone" "$project" >> "$rows" +done + +# Append sorted unique rows.
+if [ -s "$rows" ]; then + sort "$rows" | uniq >> "$tmp" +fi + +mv "$tmp" "$OUT_FILE" \ No newline at end of file diff --git a/config/monitoring/vector/service.yaml b/config/monitoring/vector/service.yaml new file mode 100644 index 0000000..b77149a --- /dev/null +++ b/config/monitoring/vector/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: vector + labels: + app.kubernetes.io/name: vector +spec: + selector: + app.kubernetes.io/name: vector + ports: + - name: vector + port: 9000 + targetPort: vector + protocol: TCP diff --git a/config/overlays/vector-metrics-gateway/kustomization.yaml b/config/overlays/vector-metrics-gateway/kustomization.yaml new file mode 100644 index 0000000..bdbc976 --- /dev/null +++ b/config/overlays/vector-metrics-gateway/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: dns-agent-system + +resources: + - ../../monitoring/vector diff --git a/internal/controller/dnszone_replicator_controller.go b/internal/controller/dnszone_replicator_controller.go index 58e3729..3d5d028 100644 --- a/internal/controller/dnszone_replicator_controller.go +++ b/internal/controller/dnszone_replicator_controller.go @@ -323,6 +323,10 @@ func (r *DNSZoneReplicator) ensureZoneAccounting(ctx context.Context, upstream * newCM.Data = map[string]string{ "owner": owner, } + // NOTE(review): label values must be <=63 chars of [A-Za-z0-9._-]; confirm owner never contains '/' or this Create will be rejected. + newCM.Labels = map[string]string{ + "datum.net/dnszone-owner": owner, + } if cerr := r.DownstreamClient.Create(ctx, &newCM); cerr != nil { // A race can occur; if created by another, treat as not owned and let next reconcile decide if apierrors.IsAlreadyExists(cerr) { diff --git a/test/e2e/zones-and-records/chainsaw-test.yaml b/test/e2e/zones-and-records/chainsaw-test.yaml index 01a5974..13e4df3 100644 --- a/test/e2e/zones-and-records/chainsaw-test.yaml +++ b/test/e2e/zones-and-records/chainsaw-test.yaml @@ -404,3 +404,47 @@ spec: kind: DNSZoneClass check: ($error == null): true + + - name: Verify Metrics -
curl into vector pod and check for metrics + try: + - create: + cluster: downstream + resource: + apiVersion: v1 + kind: Pod + metadata: + name: vector-metrics-check + namespace: default + spec: + restartPolicy: Never + containers: + - name: curl + image: curlimages/curl:8.5.0 + command: ["sleep", "3600"] + - assert: + cluster: downstream + resource: + apiVersion: v1 + kind: Pod + metadata: + name: vector-metrics-check + namespace: default + status: + phase: Running + - sleep: + duration: 10s + - script: + cluster: downstream + content: | + set -euo pipefail + kubectl -n default exec pod/vector-metrics-check -- sh -c ' + metrics=$(curl -fsS http://pdns-auth.dns-agent-system.svc.cluster.local:9598/metrics) + echo "Prometheus metrics:" + echo "$metrics" | awk "/^dns_/ { split(\$0,a,\"{\"); print a[1] }" | sort -u || true + total=$(echo "$metrics" | awk "/^dns_queries_total/ {print \$2; exit}") + if [ -z "${total:-}" ] || [ "$total" = "0" ]; then + echo "dns_queries_total is zero or missing" >&2 + exit 1 + fi + echo "dns_queries_total: $total" + '