diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md index 2e72098..91233de 100644 --- a/docs/guides/monitoring.md +++ b/docs/guides/monitoring.md @@ -532,6 +532,42 @@ PTD deploys a set of Grafana alerts to the control room for centralized monitori All alerts are configured to send notifications to OpsGenie when triggered. +### Alert Format + +Alerts use a standardized format for consistency across all alert types: + +``` +[🔴 CRITICAL | 🟡 WARNING]: [Title] + +[Description] + +─── WHERE ─────────────────────────── +Tenant: [tenant name] (Note: The organization or group that a workload cluster is provisioned for) +Cluster: [cluster name] +Component: [affected component] + +─── DETAILS ───────────────────────── +[Key]: [Value] +[Key]: [Value] +... + +Source: [link to the alert rule in Grafana] +Silence: [link to silence the alert] +``` + +**Severity levels:** +- 🔴 **CRITICAL** — Immediate action required +- 🟡 **WARNING** — Investigate soon + +**Alert types and their WHERE/DETAILS fields:** + +| Type | WHERE | DETAILS | +|------|-------|---------| +| Health Check | Tenant, Cluster, Site, Product | Check Type, Endpoint, Status, Duration | +| Kubernetes | Tenant, Cluster, Namespace, Pod/Node | Varies by alert (restarts, replicas, conditions) | +| Cloud (AWS) | Tenant, Cluster, Resource, Region | Metric, Current, Threshold, Duration | +| Cloud (Azure) | Tenant, Cluster, Resource, Location | Metric, Current, Threshold, Duration | + ### Application Alerts | Alert | Threshold | Duration | Description | diff --git a/examples/workload/ptd.yaml b/examples/workload/ptd.yaml index c04a275..4a867c0 100644 --- a/examples/workload/ptd.yaml +++ b/examples/workload/ptd.yaml @@ -11,6 +11,9 @@ spec: # AWS account ID where the workload will be deployed account_id: "123456789012" + # Human-readable tenant name for alerts (defaults to compound_name if not set) + tenant_name: "Example Analytics Team" + # Control room that manages this workload control_room_account_id: "123456789012" control_room_cluster_name: 
control-room-production diff --git a/python-pulumi/src/ptd/__init__.py b/python-pulumi/src/ptd/__init__.py index 2e2c01f..4949468 100644 --- a/python-pulumi/src/ptd/__init__.py +++ b/python-pulumi/src/ptd/__init__.py @@ -316,6 +316,7 @@ class WorkloadConfig: network_trust: NetworkTrust sites: typing.Mapping[str, SiteConfig] true_name: str + tenant_name: str | None = dataclasses.field(default=None, kw_only=True) # Human-readable name for the tenant @property def domain(self) -> str: diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml index 2e089d6..7f36bb7 100644 --- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml @@ -63,8 +63,21 @@ groups: execErrState: Error for: 5m annotations: - description: Loki ingester has experienced WAL disk full failures. This indicates storage issues with the Loki WAL directory in cluster {{ $labels.cluster }}. - summary: Loki WAL disk full failures detected in {{ $labels.cluster }} + summary: "🔴 CRITICAL: Loki WAL Disk Full" + description: | + Loki ingester experiencing WAL disk full failures + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Component: Loki Ingester + Issue: WAL disk full + Duration: 5 minutes + labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index 0e18559..076c603 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -63,8 +63,22 @@ groups: execErrState: Error for: 5m annotations: - description: Evaluates FSx instance usage to determine if it is greater than 80% allocated. 
- summary: FSx instance {{$labels.dimension_FileSystemId}} has less than 20% of capacity remaining. + summary: "🟡 WARNING: FSx Storage Capacity Low" + description: | + FSx file system is running low on storage capacity + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_FileSystemId }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Storage Capacity + Current: > 80% used + Threshold: 80% + Duration: 5 minutes + labels: opsgenie: "1" isPaused: false @@ -124,8 +138,22 @@ groups: execErrState: Error for: 5m annotations: - description: EC2 instance has high network outbound traffic (over 300 MiB/s) for more than 5 minutes. - summary: High network outbound traffic on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: EC2 Network Out High" + description: | + EC2 instance has unusually high outbound network traffic + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_InstanceId }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Network Out + Current: > 300 MiB/s + Threshold: 300 MiB/s + Duration: 5 minutes + labels: opsgenie: "1" isPaused: false @@ -181,8 +209,22 @@ groups: execErrState: Error for: 5m annotations: - description: EC2 instance has an unusually high packet transmission rate (over 400,000 packets/s) for more than 5 minutes, which could indicate abnormal network activity. 
- summary: High network packet rate on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: EC2 Network Packets Out High" + description: | + EC2 instance has unusually high packet transmission rate + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_InstanceId }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Network Packets Out + Current: > 400,000 packets/s + Threshold: 400,000 packets/s + Duration: 5 minutes + labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml index 3924d39..0922c8b 100644 --- a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml @@ -68,8 +68,22 @@ groups: execErrState: Error for: 5m annotations: - description: Calls the health check for each component of each PTD site and errors on non-200 response. - summary: "Health check failed! 
\nCluster: {{ $labels.cluster }} \nSite: {{ $labels.ptd_site }}\nComponent: {{ $labels.ptd_component }}\nCheck Type: {{ $labels.check_type }}\nURL: {{ $labels.health_check_url }}" + summary: "🔴 CRITICAL: Health Check Failed" + description: | + Product health check returning non-200 response + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Site: {{ $labels.ptd_site }} + Product: {{ $labels.ptd_component }} + + ─── DETAILS ───────────────────────── + Check Type: {{ $labels.check_type }} + Endpoint: {{ $labels.health_check_url }} + Status: Non-200 response + Duration: 5 minutes + labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml index 71da183..b5f6779 100644 --- a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml @@ -91,8 +91,19 @@ groups: execErrState: Error for: 10m annotations: - description: No metrics have been received from workload cluster {{ $labels.cluster }} for 10 minutes. This could indicate Alloy is not running, network issues between the workload and control room, or the workload cluster is down. 
- summary: "Workload metrics silent!\nCluster: {{ $labels.cluster }}" + summary: "🔴 CRITICAL: Workload Metrics Silent" + description: | + No metrics received from workload cluster + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + + ─── DETAILS ───────────────────────── + Component: Metrics Pipeline + Issue: No metrics received + Duration: 10 minutes + labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml index dde33dc..bb6f91d 100644 --- a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml @@ -55,8 +55,19 @@ groups: execErrState: Error for: 15m annotations: - summary: Node not in ready state (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has been unready for 15 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🔴 CRITICAL: Node Not Ready" + description: | + Kubernetes node is not accepting workloads + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Node: {{ $labels.node }} + + ─── DETAILS ───────────────────────── + Condition: NotReady + Duration: 15 minutes + labels: opsgenie: "1" isPaused: false @@ -110,8 +121,19 @@ groups: execErrState: Error for: 15m annotations: - summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Node Memory Pressure" + description: | + Node is running low on available memory + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Node: {{ $labels.node }} + + ─── DETAILS ───────────────────────── + Condition: MemoryPressure + Duration: 15 minutes + 
labels: opsgenie: "1" isPaused: false @@ -165,8 +187,19 @@ groups: execErrState: Error for: 15m annotations: - summary: Kubernetes Node disk pressure (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Node Disk Pressure" + description: | + Node is running low on available disk space + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Node: {{ $labels.node }} + + ─── DETAILS ───────────────────────── + Condition: DiskPressure + Duration: 15 minutes + labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 753fce6..5ffda98 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -49,7 +49,7 @@ groups: type: prometheus uid: mimir editorMode: code - expr: count by (cluster, container)(kube_pod_container_status_waiting_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason="CrashLoopBackOff"}) + expr: count by (cluster, namespace, pod, container)(kube_pod_container_status_waiting_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason="CrashLoopBackOff"}) instant: true intervalMs: 1000 legendFormat: __auto @@ -116,7 +116,21 @@ groups: execErrState: Error for: 5m annotations: - summary: Container {{ $labels.container }} in {{ $labels.cluster }} is in CrashLoopBackoff state. 
\n VALUE = {{ $value }}\n LABELS = {{ $labels }} + summary: "🔴 CRITICAL: Container Crash-Looping" + description: | + Container keeps crashing and restarting + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} + Container: {{ $labels.container }} + + ─── DETAILS ───────────────────────── + Status: CrashLoopBackOff + Duration: 5 minutes + labels: opsgenie: "1" isPaused: false @@ -135,7 +149,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: count by(cluster, pod, reason) (kube_pod_container_status_terminated_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) + expr: count by(cluster, namespace, pod, reason) (kube_pod_container_status_terminated_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) fullMetaSearch: false includeNullMetadata: true instant: true @@ -207,7 +221,20 @@ groups: execErrState: Error for: 5m annotations: - summary: Pod {{ $labels.pod }} in cluster {{ $labels.cluster }} is in a {{ $labels.reason }} state. 
+ summary: "🟡 WARNING: Pod Error" + description: | + Pod container terminated with an error + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Reason: {{ $labels.reason }} + Duration: 5 minutes + labels: opsgenie: "1" isPaused: false @@ -226,7 +253,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: sum by (cluster, pod, phase) (kube_pod_status_phase{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 + expr: sum by (cluster, namespace, pod, phase) (kube_pod_status_phase{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 fullMetaSearch: false includeNullMetadata: true instant: true @@ -294,8 +321,20 @@ groups: execErrState: Error for: 15m annotations: - summary: Workload Cluster Pod not healthy (pod {{ $labels.pod }}) - description: "Cluster: {{ $labels.cluster }}, Pod: {{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Pod Not Healthy" + description: | + Pod has been in a non-running state + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Phase: {{ $labels.phase }} + Duration: 15 minutes + labels: opsgenie: "1" isPaused: false @@ -386,8 +425,19 @@ groups: execErrState: Error for: 15m annotations: - summary: Workload Cluster Pod {{ $labels.pod }} has more than 5 restarts in 15 minutes. 
- description: "Cluster: {{ $labels.cluster }}, Pod: {{ $labels.pod }} has restarted {{$value}} times in 15 minutes. \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Pod Restarts" + description: | + Pod has restarted excessively + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Issue: > 5 restarts in 15 minutes + labels: opsgenie: "1" isPaused: false @@ -471,8 +521,20 @@ groups: execErrState: Error for: 15m annotations: - summary: Deployment replicas mismatch (instance {{ $labels.instance }}) - description: "Cluster: {{ $labels.cluster }}, Deployment: {{ $labels.deployment }} has mismatched desired and available replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Deployment Unhealthy" + description: | + Deployment has fewer replicas than expected + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Deployment: {{ $labels.deployment }} + + ─── DETAILS ───────────────────────── + Status: Replicas mismatch + Duration: 15 minutes + labels: opsgenie: "1" isPaused: false @@ -556,8 +618,20 @@ groups: execErrState: Error for: 15m annotations: - summary: Stateful Set replicas mismatch (instance {{ $labels.instance }}) - description: "Cluster: {{ $labels.cluster }}, Stateful Set: {{ $labels.statefulset }} does not match the desired number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: StatefulSet Unhealthy" + description: | + StatefulSet has fewer replicas than expected + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + StatefulSet: {{ $labels.statefulset }} + + ─── DETAILS ───────────────────────── + Status: Replicas mismatch + 
Duration: 15 minutes + labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 9e08490..7d97f0c 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1963,6 +1963,21 @@ def with_grafana( ), values={ "alerting": { + # Custom notification templates for clean alert formatting. + # These templates output ONLY our formatted content without + # Grafana's default prefix (Firing, Value, Labels, Annotations). + "templates.yaml": { + "apiVersion": 1, + "templates": [ + { + "orgId": 1, + "name": "ptd_templates", + # Template outputs description annotation + Source/Silence links for each alert. + # This avoids Grafana's default verbose format while keeping useful links. + "template": '{{ "{{" }} define "ptd.description" {{ "}}" }}{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}\n\nSource: {{ "{{" }} .GeneratorURL {{ "}}" }}\nSilence: {{ "{{" }} .SilenceURL {{ "}}" }}{{ "{{" }} end {{ "}}" }}{{ "{{" }} end {{ "}}" }}', + } + ], + }, "contactpoints.yaml": { "apiVersion": 1, "contactPoints": [ @@ -1976,6 +1991,10 @@ def with_grafana( "settings": { "apiKey": '${{ "{" }}POSIT_OPSGENIE_KEY{{ "}" }}', # ${POSIT_OPSGENIE_KEY} in the resulting configMap, "apiUrl": "https://api.opsgenie.com/v2/alerts", + "sendTagsAs": "tags", + "message": '{{ "{{" }} .CommonAnnotations.summary {{ "}}" }}', + # Use custom template for clean description without label dumps + "description": '{{ "{{" }} template "ptd.description" . {{ "}}" }}', }, } ], @@ -1988,7 +2007,10 @@ def with_grafana( { "orgId": 1, "receiver": "posit-opsgenie", - "group_by": ["alertname", "cluster"], + # health_check_url ensures each health check endpoint gets its own alert + # (internal vs fqdn checks are separate). 
This label is empty for + # non-healthcheck alerts so it won't affect their grouping. + "group_by": ["alertname", "cluster", "ptd_component", "health_check_url"], "matchers": ["opsgenie = 1"], "group_wait": "30s", "group_interval": "5m", diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 874fa33..fa9235c 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -127,6 +127,7 @@ def _is_fqdn_health_check_enabled(self, site_dict: dict[str, typing.Any] | None) def _define_blackbox_targets(self) -> str: output = "" + tenant_name = self.workload.cfg.tenant_name or self.workload.compound_name for site_name, site_config in self.workload.cfg.sites.items(): # Parse site YAML once for this site @@ -150,6 +151,7 @@ def _define_blackbox_targets(self) -> str: address = {internal_address} module = "{component.module_name}" labels = {{ + "tenant_name" = "{tenant_name}", "ptd_site" = "{site_name}", "ptd_component" = "{lower_name}", "check_type" = "internal", @@ -168,6 +170,7 @@ def _define_blackbox_targets(self) -> str: address = {fqdn_address} module = "{component.module_name}" labels = {{ + "tenant_name" = "{tenant_name}", "ptd_site" = "{site_name}", "ptd_component" = "{lower_name}", "check_type" = "fqdn", @@ -264,12 +267,12 @@ def _define_cloudwatch_config(self) -> str: period = "5m" }} - # TODO: Remove ["Sum"] from statistics once all Grafana dashboards have - # been updated to query aws_rds_database_connections_average. - # Collecting both Sum and Average during migration. Average is the - # target metric (aws_rds_database_connections_average); Sum - # (aws_rds_database_connections_sum) is kept temporarily for existing - # dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. 
+ // TODO: Remove ["Sum"] from statistics once all Grafana dashboards have + // been updated to query aws_rds_database_connections_average. + // Collecting both Sum and Average during migration. Average is the + // target metric (aws_rds_database_connections_average). Sum + // (aws_rds_database_connections_sum) is kept temporarily for existing + // dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. metric {{ name = "DatabaseConnections" statistics = ["Average", "Sum"] @@ -294,14 +297,14 @@ def _define_cloudwatch_config(self) -> str: period = "5m" }} - # Collected for dashboard visibility; no alert rules defined + // Collected for dashboard visibility, no alert rules defined metric {{ name = "WriteLatency" statistics = ["Average"] period = "5m" }} - # Collected for dashboard visibility; no alert rules defined + // Collected for dashboard visibility, no alert rules defined metric {{ name = "Deadlocks" statistics = ["Sum"] @@ -334,8 +337,8 @@ def _define_cloudwatch_config(self) -> str: type = "AWS/NATGateway" regions = ["{self.region}"] - # NAT Gateways inherit VPC tags including posit.team/true-name - # (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) + // NAT Gateways inherit VPC tags including posit.team/true-name + // (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} @@ -357,12 +360,12 @@ def _define_cloudwatch_config(self) -> str: type = "AWS/ApplicationELB" regions = ["{self.region}"] - # ALBs are tagged at creation time via aws_workload_helm.py. - # LBs provisioned before this tag was added won't be discovered - # until the cluster is redeployed. - # FIXME: To tag existing ALBs without redeploying, use the AWS CLI: - # aws elbv2 add-tags --resource-arns \ - # --tags Key=posit.team/true-name,Value= + // ALBs are tagged at creation time via aws_workload_helm.py. 
+ // LBs provisioned before this tag was added won't be discovered + // until the cluster is redeployed. + // FIXME: To tag existing ALBs without redeploying, use the AWS CLI: + // aws elbv2 add-tags --resource-arns <alb-arn> + // --tags Key=posit.team/true-name,Value=<true-name> search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} @@ -390,12 +393,12 @@ def _define_cloudwatch_config(self) -> str: type = "AWS/NetworkELB" regions = ["{self.region}"] - # NLBs are tagged at creation time via traefik.py. - # LBs provisioned before this tag was added won't be discovered - # until the cluster is redeployed. - # FIXME: To tag existing NLBs without redeploying, use the AWS CLI: - # aws elbv2 add-tags --resource-arns \ - # --tags Key=posit.team/true-name,Value= + // NLBs are tagged at creation time via traefik.py. + // LBs provisioned before this tag was added won't be discovered + // until the cluster is redeployed. + // FIXME: To tag existing NLBs without redeploying, use the AWS CLI: + // aws elbv2 add-tags --resource-arns <nlb-arn> + // --tags Key=posit.team/true-name,Value=<true-name> search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} @@ -433,6 +436,9 @@ def _define_config_map( account_id = self.workload.cfg.account_id cluster_name = self.workload.eks_cluster_name(self.release) + # Use tenant_name if set, otherwise fall back to compound_name + tenant_name = self.workload.cfg.tenant_name or self.workload.compound_name + # Generate CloudWatch exporter configuration for AWS cloudwatch_config = self._define_cloudwatch_config() @@ -586,6 +592,9 @@ def _define_config_map( }} prometheus.remote_write "control_room" {{ + external_labels = {{ + tenant_name = "{tenant_name}", + }} endpoint {{ url = "{control_room_url}" basic_auth {{ @@ -599,6 +608,9 @@ def _define_config_map( }} prometheus.remote_write "workload" {{ + external_labels = {{ + tenant_name = "{tenant_name}", + }} endpoint {{ url = "{workload_url}" }} @@ -768,6 +780,7 @@ def _define_config_map( 
external_labels = {{ data = "true", + tenant_name = "{tenant_name}", }} }} """