From 4e138298e24e8855045fb5f3345be03f7964bb19 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 12:55:13 -0800 Subject: [PATCH 01/27] Add CloudWatch alerting for RDS, NAT Gateway, and Load Balancers Add alert rules for RDS (CPU, storage, memory, latency, connections), NAT Gateway (port allocation errors, dropped packets), and ALB/NLB (5XX errors, unhealthy targets, response latency). Add WriteLatency and Deadlocks metrics to RDS Alloy scraping, and add new CloudWatch discovery blocks for NAT Gateway, ALB, and NLB. Ref: ptd-config#2778 --- .../src/ptd/grafana_alerts/loadbalancer.yaml | 235 ++++++++++++++ .../src/ptd/grafana_alerts/natgateway.yaml | 125 ++++++++ python-pulumi/src/ptd/grafana_alerts/rds.yaml | 290 ++++++++++++++++++ .../ptd/pulumi_resources/aws_eks_cluster.py | 3 + .../src/ptd/pulumi_resources/grafana_alloy.py | 75 +++++ 5 files changed, 728 insertions(+) create mode 100644 python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml create mode 100644 python-pulumi/src/ptd/grafana_alerts/natgateway.yaml create mode 100644 python-pulumi/src/ptd/grafana_alerts/rds.yaml diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml new file mode 100644 index 0000000..0faf1a0 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -0,0 +1,235 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. +# Replace file contents with the following and apply in order to delete the alerts: +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: alb_target_5xx_errors_high +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +apiVersion: 1 +groups: + - orgId: 1 + name: LoadBalancer + folder: Posit Alerts + interval: 5m + rules: + - uid: alb_target_5xx_errors_high + title: ALB Target 5XX Errors High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_applicationelb_httpcode_target_5_xx_count_sum{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 10 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures. + summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: alb_unhealthy_targets + title: ALB Unhealthy Targets + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_applicationelb_un_healthy_host_count_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. + summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: nlb_unhealthy_targets + title: NLB Unhealthy Targets + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_networkelb_un_healthy_host_count_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. + summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: alb_response_latency_high + title: ALB Response Latency High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_applicationelb_target_response_time_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 2 # 2 seconds + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + description: Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation. + summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml new file mode 100644 index 0000000..41e5798 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -0,0 +1,125 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. +# Replace file contents with the following and apply in order to delete the alerts: +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: nat_gateway_port_allocation_errors +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +apiVersion: 1 +groups: + - orgId: 1 + name: NATGateway + folder: Posit Alerts + interval: 5m + rules: + - uid: nat_gateway_port_allocation_errors + title: NAT Gateway Port Allocation Errors + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_natgateway_error_port_allocation_sum{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 1m + annotations: + description: NAT Gateway is experiencing port allocation errors, which means outbound network connectivity is failing. This is a critical issue that requires immediate attention. + summary: Port allocation errors on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: nat_gateway_packets_dropped + title: NAT Gateway Packets Dropped + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_natgateway_packets_drop_count_sum{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 100 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: NAT Gateway has dropped more than 100 packets for over 5 minutes, indicating potential network issues. + summary: High packet drop rate on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml new file mode 100644 index 0000000..80a0a93 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -0,0 +1,290 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. +# Replace file contents with the following and apply in order to delete the alerts: +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: rds_cpu_utilization_high +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +apiVersion: 1 +groups: + - orgId: 1 + name: RDS + folder: Posit Alerts + interval: 5m + rules: + - uid: rds_cpu_utilization_high + title: RDS CPU Utilization High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_cpuutilization_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + description: RDS instance CPU utilization is above 80% for more than 10 minutes. + summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: rds_free_storage_low + title: RDS Free Storage Low + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_free_storage_space_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 5368709120 # 5 GiB in bytes + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: RDS instance has less than 5 GiB of free storage space remaining. + summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: rds_freeable_memory_low + title: RDS Freeable Memory Low + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_freeable_memory_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 268435456 # 256 MiB in bytes + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + description: RDS instance has less than 256 MiB of freeable memory remaining for more than 10 minutes. + summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: rds_read_latency_high + title: RDS Read Latency High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_read_latency_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0.1 # 100ms in seconds + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + description: RDS instance read latency is above 100ms for more than 10 minutes. + summary: High read latency on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: rds_database_connections_high + title: RDS Database Connections High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_database_connections_sum{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: RDS instance has more than 80 active database connections for more than 5 minutes. + summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 463c703..a7023af 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1949,6 +1949,9 @@ def with_grafana( self._create_alert_configmap("pods", grafana_ns) self._create_alert_configmap("cloudwatch", grafana_ns) + self._create_alert_configmap("rds", grafana_ns) + self._create_alert_configmap("natgateway", grafana_ns) + self._create_alert_configmap("loadbalancer", grafana_ns) self._create_alert_configmap("healthchecks", grafana_ns) self._create_alert_configmap("nodes", grafana_ns) self._create_alert_configmap("applications", grafana_ns) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 4a75bec..ec616a8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -285,6 +285,18 @@ def _define_config_map( statistics = ["Average"] period = "5m" }} + + metric {{ + name = "WriteLatency" + statistics = ["Average"] + period = "5m" + }} + + metric {{ + name = "Deadlocks" + statistics = ["Sum"] + period = "5m" + }} }} discovery {{ @@ -307,6 +319,69 @@ def _define_config_map( period = "5m" }} }} + + discovery {{ + type = "AWS/NATGateway" + regions = ["{self.region}"] + + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} + + metric {{ + name = "ErrorPortAllocation" + statistics = ["Sum"] + period = "5m" + }} + + metric {{ + name = "PacketsDropCount" + statistics = ["Sum"] + period = "5m" + }} + }} + + discovery {{ + type = "AWS/ApplicationELB" + regions = ["{self.region}"] + + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} + + metric {{ + name = "HTTPCode_Target_5XX_Count" + statistics = ["Sum"] + period = "5m" + }} + + metric {{ + name = "UnHealthyHostCount" + statistics = ["Average"] + period = "1m" + }} + + metric {{ + name = "TargetResponseTime" + statistics = ["Average"] + period = "5m" + }} + }} + + discovery {{ + type = "AWS/NetworkELB" + regions = ["{self.region}"] + + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} + + metric {{ + name = "UnHealthyHostCount" + statistics = ["Average"] + period = "1m" + }} + }} }} prometheus.scrape "cloudwatch" {{ From a172905010526268ab6336cfcbc7fa32e3e68a9a Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 12:55:18 -0800 Subject: [PATCH 02/27] Tag Load Balancers with PTD workload identifiers Add alb.ingress.kubernetes.io/tags annotation to workload ALBs and aws-load-balancer-additional-resource-tags to control room NLBs so Alloy CloudWatch discovery can scope metrics to the correct workload via search_tags instead of scraping all LBs in the region. --- .../src/ptd/pulumi_resources/aws_workload_helm.py | 1 + python-pulumi/src/ptd/pulumi_resources/traefik.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 1e454b2..a4df0a2 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -836,6 +836,7 @@ def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str "alb.ingress.kubernetes.io/healthcheck-path": "/ping", "alb.ingress.kubernetes.io/healthcheck-port": "32090", "alb.ingress.kubernetes.io/load-balancer-attributes": "routing.http.drop_invalid_header_fields.enabled=true,idle_timeout.timeout_seconds=300", + "alb.ingress.kubernetes.io/tags": f"posit.team/true-name={self.workload.cfg.true_name},posit.team/environment={self.workload.cfg.environment},Name={self.workload.compound_name}", } if self.workload.cfg.provisioned_vpc: diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 874d2cf..7f6b767 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -120,6 +120,16 @@ def _deploy(self, cert: aws.acm.Certificate | None): :return: """ + # Build tag string from cluster tags for NLB annotation + # Extract posit.team/true-name and posit.team/environment if present + tag_parts = [] + if "posit.team/true-name" in self.cluster.tags: + tag_parts.append(f"posit.team/true-name={self.cluster.tags['posit.team/true-name']}") + if "posit.team/environment" in self.cluster.tags: + tag_parts.append(f"posit.team/environment={self.cluster.tags['posit.team/environment']}") + tag_parts.append(f"Name={self.cluster.name}") + nlb_tags = ",".join(tag_parts) + self.traefik = k8s.helm.v3.Release( f"{self.cluster.name}-traefik", k8s.helm.v3.ReleaseArgs( @@ -147,6 +157,7 @@ def _deploy(self, cert: aws.acm.Certificate | None): "service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold": "3", "service.beta.kubernetes.io/aws-load-balancer-healthcheck-timeout": "10", "service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval": "10", + "service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags": nlb_tags, }, }, "ports": { From 5f84b5f580eb7e65e945976a11aea1daee46adcb Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 03/27] Address code review findings for alerting implementation Fix RDS DatabaseConnections statistic from Sum to Average (it's a gauge, not a counter). Fix query.params referencing non-existent refId C in all alert YAML threshold conditions (including existing cloudwatch.yaml). Make NLB tagging unconditional to match ALB path. Add input validation helpers for LB tag annotation strings. Document NAT Gateway tag inheritance from VPC and dashboard-only metrics. --- .../src/ptd/grafana_alerts/cloudwatch.yaml | 6 ++-- .../src/ptd/grafana_alerts/loadbalancer.yaml | 8 ++--- .../src/ptd/grafana_alerts/natgateway.yaml | 4 +-- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 12 +++---- .../ptd/pulumi_resources/aws_workload_helm.py | 25 ++++++++++++++- .../src/ptd/pulumi_resources/grafana_alloy.py | 6 +++- .../src/ptd/pulumi_resources/traefik.py | 32 ++++++++++++++----- 7 files changed, 68 insertions(+), 25 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index d28856f..0e18559 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -46,7 +46,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -107,7 +107,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -164,7 +164,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml index 0faf1a0..668fd49 100644 --- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -46,7 +46,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -101,7 +101,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -156,7 +156,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -211,7 +211,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 41e5798..7cb4947 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -46,7 +46,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -101,7 +101,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 80a0a93..a5447e4 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -46,7 +46,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -101,7 +101,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -156,7 +156,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -211,7 +211,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -244,7 +244,7 @@ groups: datasourceUid: mimir model: editorMode: code - expr: aws_rds_database_connections_sum{job="integrations/cloudwatch"} + expr: aws_rds_database_connections_average{job="integrations/cloudwatch"} instant: true intervalMs: 1000 legendFormat: __auto @@ -266,7 +266,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index a4df0a2..e3f0db9 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -13,6 +13,22 @@ ALLOY_NAMESPACE = "alloy" +def _format_lb_tags(tags: dict[str, str]) -> str: + """Format tags as comma-separated key=value pairs for AWS LB Controller annotations. + + Validates that tag keys and values do not contain commas or equals signs, + which would break the annotation format. + """ + for key, value in tags.items(): + if "," in key or "=" in key: + msg = f"LB tag key contains invalid characters (comma or equals): {key}" + raise ValueError(msg) + if "," in value or "=" in value: + msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" + raise ValueError(msg) + return ",".join(f"{k}={v}" for k, v in tags.items()) + + class AWSWorkloadHelm(pulumi.ComponentResource): workload: ptd.aws_workload.AWSWorkload @@ -826,6 +842,13 @@ def _define_per_site_ingress_annotations( return annotations def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str]: + # Build tags dict and validate using helper + tags = { + "posit.team/true-name": self.workload.cfg.true_name, + "posit.team/environment": self.workload.cfg.environment, + "Name": self.workload.compound_name, + } + annotations = { "alb.ingress.kubernetes.io/ssl-redirect": "443", "alb.ingress.kubernetes.io/listen-ports": json.dumps([{"HTTP": 80}, {"HTTPS": 443}]), @@ -836,7 +859,7 @@ def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str "alb.ingress.kubernetes.io/healthcheck-path": "/ping", "alb.ingress.kubernetes.io/healthcheck-port": "32090", "alb.ingress.kubernetes.io/load-balancer-attributes": "routing.http.drop_invalid_header_fields.enabled=true,idle_timeout.timeout_seconds=300", - "alb.ingress.kubernetes.io/tags": f"posit.team/true-name={self.workload.cfg.true_name},posit.team/environment={self.workload.cfg.environment},Name={self.workload.compound_name}", + "alb.ingress.kubernetes.io/tags": _format_lb_tags(tags), } if self.workload.cfg.provisioned_vpc: diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index ec616a8..1e14a51 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -264,7 +264,7 @@ def _define_config_map( metric {{ name = "DatabaseConnections" - statistics = ["Sum"] + statistics = ["Average"] period = "5m" }} @@ -286,12 +286,14 @@ def _define_config_map( period = "5m" }} + # Collected for dashboard visibility; no alert rules defined metric {{ name = "WriteLatency" statistics = ["Average"] period = "5m" }} + # Collected for dashboard visibility; no alert rules defined metric {{ name = "Deadlocks" statistics = ["Sum"] @@ -324,6 +326,8 @@ def _define_config_map( type = "AWS/NATGateway" regions = ["{self.region}"] + # NAT Gateways inherit VPC tags including posit.team/true-name + # (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 7f6b767..2416132 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -9,6 +9,22 @@ import ptd.pulumi_resources.aws_vpc +def _format_nlb_tags(tags: dict[str, str]) -> str: + """Format tags as comma-separated key=value pairs for NLB annotations. + + Validates that tag keys and values do not contain commas or equals signs, + which would break the annotation format. + """ + for key, value in tags.items(): + if "," in key or "=" in key: + msg = f"NLB tag key contains invalid characters (comma or equals): {key}" + raise ValueError(msg) + if "," in value or "=" in value: + msg = f"NLB tag value contains invalid characters (comma or equals): {key}={value}" + raise ValueError(msg) + return ",".join(f"{k}={v}" for k, v in tags.items()) + + class Traefik(pulumi.ComponentResource): def __init__( self, @@ -121,14 +137,14 @@ def _deploy(self, cert: aws.acm.Certificate | None): """ # Build tag string from cluster tags for NLB annotation - # Extract posit.team/true-name and posit.team/environment if present - tag_parts = [] - if "posit.team/true-name" in self.cluster.tags: - tag_parts.append(f"posit.team/true-name={self.cluster.tags['posit.team/true-name']}") - if "posit.team/environment" in self.cluster.tags: - tag_parts.append(f"posit.team/environment={self.cluster.tags['posit.team/environment']}") - tag_parts.append(f"Name={self.cluster.name}") - nlb_tags = ",".join(tag_parts) + # Always include posit.team/true-name and posit.team/environment unconditionally + # (mirroring ALB path in aws_workload_helm.py which uses workload.cfg.true_name) + tags = { + "posit.team/true-name": self.cluster.tags["posit.team/true-name"], + "posit.team/environment": self.cluster.tags["posit.team/environment"], + "Name": self.cluster.name, + } + nlb_tags = _format_nlb_tags(tags) self.traefik = k8s.helm.v3.Release( f"{self.cluster.name}-traefik", From 0034b7dacccc0b9100d4ba386058d1cc9d964294 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 04/27] Address review findings (job 75) All tests pass. Here's a summary of the changes: --- Changes: - Extract `format_lb_tags` into `pulumi_resources/lib.py` to eliminate duplication between `aws_workload_helm.py` and `traefik.py` - Replace `_format_lb_tags` in `aws_workload_helm.py` with import from `lib.py` - Replace `_format_nlb_tags` in `traefik.py` with import from `lib.py` - Fix `KeyError` risk in `traefik.py`: use `.get()` with explicit `ValueError` if required cluster tags are missing - Raise `rds_database_connections_high` threshold from 80 to 500 with explanatory comment referencing instance type max - Update delete instructions in `rds.yaml`, `loadbalancer.yaml`, and `natgateway.yaml` to list all rule UIDs --- .../src/ptd/grafana_alerts/loadbalancer.yaml | 9 +++++- .../src/ptd/grafana_alerts/natgateway.yaml | 5 ++- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 15 +++++++-- .../ptd/pulumi_resources/aws_workload_helm.py | 19 ++--------- python-pulumi/src/ptd/pulumi_resources/lib.py | 14 ++++++++ .../src/ptd/pulumi_resources/traefik.py | 32 +++++++------------ 6 files changed, 51 insertions(+), 43 deletions(-) create mode 100644 python-pulumi/src/ptd/pulumi_resources/lib.py diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml index 668fd49..0b3e9c6 100644 --- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -1,9 +1,16 @@ # To delete these alerts, simply removing the configMap that uses this method will not work. -# Replace file contents with the following and apply in order to delete the alerts: +# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): # apiVersion: 1 # deleteRules: # - orgId: 1 # uid: alb_target_5xx_errors_high +# - orgId: 1 +# uid: alb_unhealthy_targets +# - orgId: 1 +# uid: nlb_unhealthy_targets +# - orgId: 1 +# uid: alb_response_latency_high # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ apiVersion: 1 diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 7cb4947..4a419e6 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -1,9 +1,12 @@ # To delete these alerts, simply removing the configMap that uses this method will not work. -# Replace file contents with the following and apply in order to delete the alerts: +# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): # apiVersion: 1 # deleteRules: # - orgId: 1 # uid: nat_gateway_port_allocation_errors +# - orgId: 1 +# uid: nat_gateway_packets_dropped # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ apiVersion: 1 diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index a5447e4..bc58f70 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -1,9 +1,18 @@ # To delete these alerts, simply removing the configMap that uses this method will not work. -# Replace file contents with the following and apply in order to delete the alerts: +# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): # apiVersion: 1 # deleteRules: # - orgId: 1 # uid: rds_cpu_utilization_high +# - orgId: 1 +# uid: rds_free_storage_low +# - orgId: 1 +# uid: rds_freeable_memory_low +# - orgId: 1 +# uid: rds_read_latency_high +# - orgId: 1 +# uid: rds_database_connections_high # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ apiVersion: 1 @@ -260,7 +269,7 @@ groups: conditions: - evaluator: params: - - 80 + - 500 # Absolute threshold; tune per instance type (db.r5.large max ~4000) type: gt operator: type: and @@ -283,7 +292,7 @@ groups: execErrState: Error for: 5m annotations: - description: RDS instance has more than 80 active database connections for more than 5 minutes. + description: RDS instance has more than 500 active database connections for more than 5 minutes. summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index e3f0db9..933904d 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -9,26 +9,11 @@ import ptd.aws_workload import ptd.pulumi_resources.aws_eks_cluster from ptd.pulumi_resources.grafana_alloy import AlloyConfig +from ptd.pulumi_resources.lib import format_lb_tags ALLOY_NAMESPACE = "alloy" -def _format_lb_tags(tags: dict[str, str]) -> str: - """Format tags as comma-separated key=value pairs for AWS LB Controller annotations. - - Validates that tag keys and values do not contain commas or equals signs, - which would break the annotation format. - """ - for key, value in tags.items(): - if "," in key or "=" in key: - msg = f"LB tag key contains invalid characters (comma or equals): {key}" - raise ValueError(msg) - if "," in value or "=" in value: - msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" - raise ValueError(msg) - return ",".join(f"{k}={v}" for k, v in tags.items()) - - class AWSWorkloadHelm(pulumi.ComponentResource): workload: ptd.aws_workload.AWSWorkload @@ -859,7 +844,7 @@ def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str "alb.ingress.kubernetes.io/healthcheck-path": "/ping", "alb.ingress.kubernetes.io/healthcheck-port": "32090", "alb.ingress.kubernetes.io/load-balancer-attributes": "routing.http.drop_invalid_header_fields.enabled=true,idle_timeout.timeout_seconds=300", - "alb.ingress.kubernetes.io/tags": _format_lb_tags(tags), + "alb.ingress.kubernetes.io/tags": format_lb_tags(tags), } if self.workload.cfg.provisioned_vpc: diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py new file mode 100644 index 0000000..ea3b8b7 --- /dev/null +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -0,0 +1,14 @@ +def format_lb_tags(tags: dict[str, str]) -> str: + """Format tags as comma-separated key=value pairs for AWS LB Controller annotations. + + Validates that tag keys and values do not contain commas or equals signs, + which would break the annotation format. + """ + for key, value in tags.items(): + if "," in key or "=" in key: + msg = f"LB tag key contains invalid characters (comma or equals): {key}" + raise ValueError(msg) + if "," in value or "=" in value: + msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" + raise ValueError(msg) + return ",".join(f"{k}={v}" for k, v in tags.items()) diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 2416132..8953721 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -7,22 +7,7 @@ import ptd.pulumi_resources.aws_eks_cluster import ptd.pulumi_resources.aws_vpc - - -def _format_nlb_tags(tags: dict[str, str]) -> str: - """Format tags as comma-separated key=value pairs for NLB annotations. - - Validates that tag keys and values do not contain commas or equals signs, - which would break the annotation format. - """ - for key, value in tags.items(): - if "," in key or "=" in key: - msg = f"NLB tag key contains invalid characters (comma or equals): {key}" - raise ValueError(msg) - if "," in value or "=" in value: - msg = f"NLB tag value contains invalid characters (comma or equals): {key}={value}" - raise ValueError(msg) - return ",".join(f"{k}={v}" for k, v in tags.items()) +from ptd.pulumi_resources.lib import format_lb_tags class Traefik(pulumi.ComponentResource): @@ -137,14 +122,19 @@ def _deploy(self, cert: aws.acm.Certificate | None): """ # Build tag string from cluster tags for NLB annotation - # Always include posit.team/true-name and posit.team/environment unconditionally - # (mirroring ALB path in aws_workload_helm.py which uses workload.cfg.true_name) + true_name = self.cluster.tags.get("posit.team/true-name") + environment = self.cluster.tags.get("posit.team/environment") + if true_name is None or environment is None: + raise ValueError( + "Cluster tags must include 'posit.team/true-name' and 'posit.team/environment' " + f"for NLB tagging. Available tags: {list(self.cluster.tags.keys())}" + ) tags = { - "posit.team/true-name": self.cluster.tags["posit.team/true-name"], - "posit.team/environment": self.cluster.tags["posit.team/environment"], + "posit.team/true-name": true_name, + "posit.team/environment": environment, "Name": self.cluster.name, } - nlb_tags = _format_nlb_tags(tags) + nlb_tags = format_lb_tags(tags) self.traefik = k8s.helm.v3.Release( f"{self.cluster.name}-traefik", From 7e4ff77031e983150c52fcd6bb2c0ef6fcb85c39 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 05/27] Address review findings (job 78) All tests pass. Here's the summary: Changes: - Add empty-string validation for keys and values in `format_lb_tags` to prevent silent `key=` tags rejected by AWS LB Controller - Add unit tests for `format_lb_tags` covering normal output, comma/equals error paths, and empty-string edge cases - Raise `rds_freeable_memory_low` threshold from 256 MiB to 512 MiB to reduce noise on healthy mid-sized instances; update alert description to match - Update `rds_database_connections_high` comment to document that the 500-connection threshold is calibrated for `db.r5.large` and will never fire for small instances like `db.t3.small` - Add comment in `aws_workload_helm.py` documenting that `true_name`, `environment`, and `compound_name` are plain strings loaded from YAML config, not Pulumi Outputs --- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 6 +-- .../ptd/pulumi_resources/aws_workload_helm.py | 3 +- python-pulumi/src/ptd/pulumi_resources/lib.py | 6 +++ python-pulumi/tests/test_lib.py | 47 +++++++++++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) create mode 100644 python-pulumi/tests/test_lib.py diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index bc58f70..6eee207 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -159,7 +159,7 @@ groups: conditions: - evaluator: params: - - 268435456 # 256 MiB in bytes + - 536870912 # 512 MiB in bytes type: lt operator: type: and @@ -182,7 +182,7 @@ groups: execErrState: Error for: 10m annotations: - description: RDS instance has less than 256 MiB of freeable memory remaining for more than 10 minutes. + description: RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes. summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" @@ -269,7 +269,7 @@ groups: conditions: - evaluator: params: - - 500 # Absolute threshold; tune per instance type (db.r5.large max ~4000) + - 500 # Calibrated for db.r5.large (max ~4000). For small instances (db.t3.small max ~36) this alert will never fire; adjust per instance class. type: gt operator: type: and diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 933904d..602a75b 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -827,7 +827,8 @@ def _define_per_site_ingress_annotations( return annotations def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str]: - # Build tags dict and validate using helper + # cfg.true_name, cfg.environment, and compound_name are plain str values + # loaded from YAML config at startup (see ptd/workload.py), not Pulumi Outputs. tags = { "posit.team/true-name": self.workload.cfg.true_name, "posit.team/environment": self.workload.cfg.environment, diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index ea3b8b7..9d5c49a 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -5,9 +5,15 @@ def format_lb_tags(tags: dict[str, str]) -> str: which would break the annotation format. """ for key, value in tags.items(): + if not key: + msg = "LB tag key must not be empty" + raise ValueError(msg) if "," in key or "=" in key: msg = f"LB tag key contains invalid characters (comma or equals): {key}" raise ValueError(msg) + if not value: + msg = f"LB tag value must not be empty: key={key}" + raise ValueError(msg) if "," in value or "=" in value: msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" raise ValueError(msg) diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py new file mode 100644 index 0000000..047c856 --- /dev/null +++ b/python-pulumi/tests/test_lib.py @@ -0,0 +1,47 @@ +import pytest + +from ptd.pulumi_resources.lib import format_lb_tags + + +def test_format_lb_tags_normal() -> None: + tags = { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-production", + } + result = format_lb_tags(tags) + assert result == "posit.team/true-name=myapp,posit.team/environment=production,Name=myapp-production" + + +def test_format_lb_tags_single_entry() -> None: + assert format_lb_tags({"key": "value"}) == "key=value" + + +def test_format_lb_tags_comma_in_key() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"bad,key": "value"}) + + +def test_format_lb_tags_equals_in_key() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"bad=key": "value"}) + + +def test_format_lb_tags_comma_in_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"key": "bad,value"}) + + +def test_format_lb_tags_equals_in_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"key": "bad=value"}) + + +def test_format_lb_tags_empty_key() -> None: + with pytest.raises(ValueError, match="must not be empty"): + format_lb_tags({"": "value"}) + + +def test_format_lb_tags_empty_value() -> None: + with pytest.raises(ValueError, match="must not be empty"): + format_lb_tags({"key": ""}) From 7cdc43e1d8cf5ac6ed10f1029c61898c5a43ed7f Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 06/27] Address review findings (job 80) All 120 tests pass. Here's the summary: Changes: - Add empty-dict guard in `format_lb_tags`: raises `ValueError("tags must not be empty")` when called with `{}` - Add `test_format_lb_tags_empty_dict` test case covering the new guard - Add `self.cluster.tags is None` guard in `traefik.py` before calling `.get()`, preventing `AttributeError` from masking the descriptive `ValueError` - Add calibration comments to `rds_free_storage_low` (5 GiB) and `rds_freeable_memory_low` (512 MiB) thresholds in `rds.yaml` noting they are sized for specific instance classes and must be adjusted for others --- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 4 ++-- python-pulumi/src/ptd/pulumi_resources/lib.py | 3 +++ python-pulumi/src/ptd/pulumi_resources/traefik.py | 5 +++++ python-pulumi/tests/test_lib.py | 5 +++++ 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 6eee207..eb4af82 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -104,7 +104,7 @@ groups: conditions: - evaluator: params: - - 5368709120 # 5 GiB in bytes + - 5368709120 # 5 GiB in bytes; calibrated for mid-size instances (100–500 GiB). Adjust for larger (e.g. 1 TiB) or smaller instances. type: lt operator: type: and @@ -159,7 +159,7 @@ groups: conditions: - evaluator: params: - - 536870912 # 512 MiB in bytes + - 536870912 # 512 MiB in bytes; calibrated for db.r5.large (~16 GiB RAM). Adjust for other instance classes. type: lt operator: type: and diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index 9d5c49a..44f37f8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -4,6 +4,9 @@ def format_lb_tags(tags: dict[str, str]) -> str: Validates that tag keys and values do not contain commas or equals signs, which would break the annotation format. """ + if not tags: + msg = "tags must not be empty" + raise ValueError(msg) for key, value in tags.items(): if not key: msg = "LB tag key must not be empty" diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 8953721..8ea30c3 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -122,6 +122,11 @@ def _deploy(self, cert: aws.acm.Certificate | None): """ # Build tag string from cluster tags for NLB annotation + if self.cluster.tags is None: + raise ValueError( + "Cluster tags must not be None; expected a dict with " + "'posit.team/true-name' and 'posit.team/environment' for NLB tagging." + ) true_name = self.cluster.tags.get("posit.team/true-name") environment = self.cluster.tags.get("posit.team/environment") if true_name is None or environment is None: diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index 047c856..18cb096 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -45,3 +45,8 @@ def test_format_lb_tags_empty_key() -> None: def test_format_lb_tags_empty_value() -> None: with pytest.raises(ValueError, match="must not be empty"): format_lb_tags({"key": ""}) + + +def test_format_lb_tags_empty_dict() -> None: + with pytest.raises(ValueError, match="must not be empty"): + format_lb_tags({}) From e3734ab099ae6f1eac2bddb115c6cb6e19b9ed80 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 07/27] Address review findings (job 85) Changes: - Extend `nat_gateway_port_allocation_errors` alert `for` duration from `1m` to `5m` to reduce transient noise (consistent with other NAT alert) - Add space validation to `format_lb_tags()` for both keys and values, since AWS tag keys must not contain spaces - Add `test_format_lb_tags_space_in_key` and `test_format_lb_tags_space_in_value` test cases for the new validation - Document in `rds_database_connections_high` description annotation that the 500-connection threshold will not fire for small instances (e.g. db.t3.small) - Add comment in `traefik.py` clarifying that `cluster.name`, `true_name`, and `environment` are plain `str` values (not Pulumi Outputs), so `format_lb_tags()` checks work correctly at graph-construction time --- python-pulumi/src/ptd/grafana_alerts/natgateway.yaml | 2 +- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 2 +- python-pulumi/src/ptd/pulumi_resources/lib.py | 6 ++++++ python-pulumi/src/ptd/pulumi_resources/traefik.py | 5 ++++- python-pulumi/tests/test_lib.py | 10 ++++++++++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 4a419e6..58715d3 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -64,7 +64,7 @@ groups: type: threshold noDataState: NoData execErrState: Error - for: 1m + for: 5m annotations: description: NAT Gateway is experiencing port allocation errors, which means outbound network connectivity is failing. This is a critical issue that requires immediate attention. summary: Port allocation errors on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}} diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index eb4af82..2203299 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -292,7 +292,7 @@ groups: execErrState: Error for: 5m annotations: - description: RDS instance has more than 500 active database connections for more than 5 minutes. + description: RDS instance has more than 500 active database connections for more than 5 minutes. Note: this threshold is calibrated for db.r5.large (max ~4000 connections); it will never fire for small instances (e.g. db.t3.small max ~36). Adjust per instance class. summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index 44f37f8..cf2e8f6 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -14,10 +14,16 @@ def format_lb_tags(tags: dict[str, str]) -> str: if "," in key or "=" in key: msg = f"LB tag key contains invalid characters (comma or equals): {key}" raise ValueError(msg) + if " " in key: + msg = f"LB tag key contains invalid character (space): {key!r}" + raise ValueError(msg) if not value: msg = f"LB tag value must not be empty: key={key}" raise ValueError(msg) if "," in value or "=" in value: msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" raise ValueError(msg) + if " " in value: + msg = f"LB tag value contains invalid character (space): {key}={value!r}" + raise ValueError(msg) return ",".join(f"{k}={v}" for k, v in tags.items()) diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 8ea30c3..a87a264 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -121,7 +121,10 @@ def _deploy(self, cert: aws.acm.Certificate | None): :return: """ - # Build tag string from cluster tags for NLB annotation + # Build tag string from cluster tags for NLB annotation. + # cluster.name, true_name, and environment are plain str values (logical resource + # names / config loaded at startup), not Pulumi Outputs, so format_lb_tags() checks + # work correctly at graph-construction time. if self.cluster.tags is None: raise ValueError( "Cluster tags must not be None; expected a dict with " diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index 18cb096..8d5633f 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -50,3 +50,13 @@ def test_format_lb_tags_empty_value() -> None: def test_format_lb_tags_empty_dict() -> None: with pytest.raises(ValueError, match="must not be empty"): format_lb_tags({}) + + +def test_format_lb_tags_space_in_key() -> None: + with pytest.raises(ValueError, match="space"): + format_lb_tags({"bad key": "value"}) + + +def test_format_lb_tags_space_in_value() -> None: + with pytest.raises(ValueError, match="space"): + format_lb_tags({"key": "bad value"}) From 82b3a8bddd804c01d31d06a167932b6e4bab2318 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 08/27] Address review findings (job 87) All 130 tests pass. Here's the summary: Changes: - `lib.py`: Extended whitespace validation in `format_lb_tags` to reject tabs (`\t`) and newlines (`\n`) in addition to spaces, for both keys and values - `traefik.py`: Extracted inline NLB tag-building logic from `_deploy()` into a module-level `_build_nlb_tag_string(tags, cluster_name)` function - `test_lib.py`: Updated space-check test match strings to `"whitespace"` (matching new message) and added 4 new tests for tab/newline in keys and values - `tests/test_traefik_nlb_tags.py`: New file with 4 tests covering the happy path, `tags is None`, missing `posit.team/true-name`, and missing `posit.team/environment` --- python-pulumi/src/ptd/pulumi_resources/lib.py | 8 ++-- .../src/ptd/pulumi_resources/traefik.py | 42 +++++++++++-------- python-pulumi/tests/test_lib.py | 24 ++++++++++- python-pulumi/tests/test_traefik_nlb_tags.py | 32 ++++++++++++++ 4 files changed, 82 insertions(+), 24 deletions(-) create mode 100644 python-pulumi/tests/test_traefik_nlb_tags.py diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index cf2e8f6..614d40d 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -14,8 +14,8 @@ def format_lb_tags(tags: dict[str, str]) -> str: if "," in key or "=" in key: msg = f"LB tag key contains invalid characters (comma or equals): {key}" raise ValueError(msg) - if " " in key: - msg = f"LB tag key contains invalid character (space): {key!r}" + if any(c in key for c in (" ", "\t", "\n")): + msg = f"LB tag key contains invalid whitespace character: {key!r}" raise ValueError(msg) if not value: msg = f"LB tag value must not be empty: key={key}" @@ -23,7 +23,7 @@ def format_lb_tags(tags: dict[str, str]) -> str: if "," in value or "=" in value: msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" raise ValueError(msg) - if " " in value: - msg = f"LB tag value contains invalid character (space): {key}={value!r}" + if any(c in value for c in (" ", "\t", "\n")): + msg = f"LB tag value contains invalid whitespace character: {key}={value!r}" raise ValueError(msg) return ",".join(f"{k}={v}" for k, v in tags.items()) diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index a87a264..2637777 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -10,6 +10,29 @@ from ptd.pulumi_resources.lib import format_lb_tags +def _build_nlb_tag_string(tags: dict[str, str] | None, cluster_name: str) -> str: + """Build the NLB annotation tag string from cluster tags.""" + if tags is None: + raise ValueError( + "Cluster tags must not be None; expected a dict with " + "'posit.team/true-name' and 'posit.team/environment' for NLB tagging." + ) + true_name = tags.get("posit.team/true-name") + environment = tags.get("posit.team/environment") + if true_name is None or environment is None: + raise ValueError( + "Cluster tags must include 'posit.team/true-name' and 'posit.team/environment' " + f"for NLB tagging. Available tags: {list(tags.keys())}" + ) + return format_lb_tags( + { + "posit.team/true-name": true_name, + "posit.team/environment": environment, + "Name": cluster_name, + } + ) + + class Traefik(pulumi.ComponentResource): def __init__( self, @@ -125,24 +148,7 @@ def _deploy(self, cert: aws.acm.Certificate | None): # cluster.name, true_name, and environment are plain str values (logical resource # names / config loaded at startup), not Pulumi Outputs, so format_lb_tags() checks # work correctly at graph-construction time. - if self.cluster.tags is None: - raise ValueError( - "Cluster tags must not be None; expected a dict with " - "'posit.team/true-name' and 'posit.team/environment' for NLB tagging." - ) - true_name = self.cluster.tags.get("posit.team/true-name") - environment = self.cluster.tags.get("posit.team/environment") - if true_name is None or environment is None: - raise ValueError( - "Cluster tags must include 'posit.team/true-name' and 'posit.team/environment' " - f"for NLB tagging. Available tags: {list(self.cluster.tags.keys())}" - ) - tags = { - "posit.team/true-name": true_name, - "posit.team/environment": environment, - "Name": self.cluster.name, - } - nlb_tags = format_lb_tags(tags) + nlb_tags = _build_nlb_tag_string(self.cluster.tags, self.cluster.name) self.traefik = k8s.helm.v3.Release( f"{self.cluster.name}-traefik", diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index 8d5633f..e00c84b 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -53,10 +53,30 @@ def test_format_lb_tags_empty_dict() -> None: def test_format_lb_tags_space_in_key() -> None: - with pytest.raises(ValueError, match="space"): + with pytest.raises(ValueError, match="whitespace"): format_lb_tags({"bad key": "value"}) def test_format_lb_tags_space_in_value() -> None: - with pytest.raises(ValueError, match="space"): + with pytest.raises(ValueError, match="whitespace"): format_lb_tags({"key": "bad value"}) + + +def test_format_lb_tags_tab_in_key() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"bad\tkey": "value"}) + + +def test_format_lb_tags_newline_in_key() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"bad\nkey": "value"}) + + +def test_format_lb_tags_tab_in_value() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"key": "bad\tvalue"}) + + +def test_format_lb_tags_newline_in_value() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"key": "bad\nvalue"}) diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py new file mode 100644 index 0000000..fbef564 --- /dev/null +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -0,0 +1,32 @@ +import pytest + +from ptd.pulumi_resources.traefik import _build_nlb_tag_string + + +def test_build_nlb_tag_string_happy_path() -> None: + result = _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, + cluster_name="myapp-cluster", + ) + assert result == "posit.team/true-name=myapp,posit.team/environment=production,Name=myapp-cluster" + + +def test_build_nlb_tag_string_tags_none() -> None: + with pytest.raises(ValueError, match="must not be None"): + _build_nlb_tag_string(tags=None, cluster_name="myapp-cluster") + + +def test_build_nlb_tag_string_missing_true_name() -> None: + with pytest.raises(ValueError, match="posit.team/true-name"): + _build_nlb_tag_string( + tags={"posit.team/environment": "production"}, + cluster_name="myapp-cluster", + ) + + +def test_build_nlb_tag_string_missing_environment() -> None: + with pytest.raises(ValueError, match="posit.team/environment"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp"}, + cluster_name="myapp-cluster", + ) From 3376e6e4e2bdf9a3c15e6fe8e82e0282f50fbe19 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 09/27] Address review findings (job 91) All tests pass. Here's the summary: Changes: - `traefik.py`: Split the combined `true_name`/`environment` null check into separate checks so the error message identifies exactly which required tag is missing - `test_traefik_nlb_tags.py`: Added test covering invalid `cluster_name` (comma in name) passed to `_build_nlb_tag_string` - `grafana_alloy.py`: Added comments to ALB and NLB discovery blocks documenting that pre-existing LBs won't be discovered until the cluster is redeployed (migration gap) --- .../src/ptd/pulumi_resources/grafana_alloy.py | 6 ++++++ python-pulumi/src/ptd/pulumi_resources/traefik.py | 11 ++++++++--- python-pulumi/tests/test_traefik_nlb_tags.py | 8 ++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 1e14a51..126f3c8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -349,6 +349,9 @@ def _define_config_map( type = "AWS/ApplicationELB" regions = ["{self.region}"] + # ALBs are tagged at creation time via aws_workload_helm.py. + # LBs provisioned before this tag was added won't be discovered + # until the cluster is redeployed. search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} @@ -376,6 +379,9 @@ def _define_config_map( type = "AWS/NetworkELB" regions = ["{self.region}"] + # NLBs are tagged at creation time via traefik.py. + # LBs provisioned before this tag was added won't be discovered + # until the cluster is redeployed. search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 2637777..6b101b1 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -19,10 +19,15 @@ def _build_nlb_tag_string(tags: dict[str, str] | None, cluster_name: str) -> str ) true_name = tags.get("posit.team/true-name") environment = tags.get("posit.team/environment") - if true_name is None or environment is None: + if true_name is None: raise ValueError( - "Cluster tags must include 'posit.team/true-name' and 'posit.team/environment' " - f"for NLB tagging. Available tags: {list(tags.keys())}" + f"Missing required tag: 'posit.team/true-name' for NLB tagging. " + f"Available tags: {list(tags.keys())}" + ) + if environment is None: + raise ValueError( + f"Missing required tag: 'posit.team/environment' for NLB tagging. " + f"Available tags: {list(tags.keys())}" ) return format_lb_tags( { diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py index fbef564..0b298eb 100644 --- a/python-pulumi/tests/test_traefik_nlb_tags.py +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -30,3 +30,11 @@ def test_build_nlb_tag_string_missing_environment() -> None: tags={"posit.team/true-name": "myapp"}, cluster_name="myapp-cluster", ) + + +def test_build_nlb_tag_string_invalid_cluster_name() -> None: + with pytest.raises(ValueError): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, + cluster_name="bad,name", + ) From d0379bf06bb158f1e5382d6c048db1c818c5cef5 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 10/27] Address review findings (job 93) All 131 tests pass. Changes: - Add `match="comma or equals"` to `test_build_nlb_tag_string_invalid_cluster_name` so the test verifies the specific error from `format_lb_tags`, not just any `ValueError` - Add FIXME comments in `grafana_alloy.py` for both ALB and NLB discovery blocks with a concrete AWS CLI command for tagging existing load balancers without redeploying --- python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 6 ++++++ python-pulumi/tests/test_traefik_nlb_tags.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 126f3c8..d2e4cbd 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -352,6 +352,9 @@ def _define_config_map( # ALBs are tagged at creation time via aws_workload_helm.py. # LBs provisioned before this tag was added won't be discovered # until the cluster is redeployed. + # FIXME: To tag existing ALBs without redeploying, use the AWS CLI: + # aws elbv2 add-tags --resource-arns \ + # --tags Key=posit.team/true-name,Value= search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} @@ -382,6 +385,9 @@ def _define_config_map( # NLBs are tagged at creation time via traefik.py. # LBs provisioned before this tag was added won't be discovered # until the cluster is redeployed. + # FIXME: To tag existing NLBs without redeploying, use the AWS CLI: + # aws elbv2 add-tags --resource-arns \ + # --tags Key=posit.team/true-name,Value= search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py index 0b298eb..565e47d 100644 --- a/python-pulumi/tests/test_traefik_nlb_tags.py +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -33,7 +33,7 @@ def test_build_nlb_tag_string_missing_environment() -> None: def test_build_nlb_tag_string_invalid_cluster_name() -> None: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="comma or equals"): _build_nlb_tag_string( tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, cluster_name="bad,name", From b98959b95eea2159dae75ce1bab2e57ace9c1bdd Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 11/27] Address review findings (job 96) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: - `lib.py`: Add AWS tag length validation — raise `ValueError` when key > 128 chars or value > 256 chars, surfacing violations at deploy time with a clear message instead of an opaque AWS API error - `tests/test_lib.py`: Add tests for key and value length limit validation - `grafana_alloy.py`: Add migration comment on `DatabaseConnections` stat change noting the Prometheus metric name changed from `_sum` to `_average` and that dashboards querying the old name need to be updated --- .../src/ptd/pulumi_resources/grafana_alloy.py | 3 +++ python-pulumi/src/ptd/pulumi_resources/lib.py | 6 ++++++ python-pulumi/tests/test_lib.py | 10 ++++++++++ 3 files changed, 19 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index d2e4cbd..d1ff5bd 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -262,6 +262,9 @@ def _define_config_map( period = "5m" }} + # Changed from Sum to Average; Prometheus metric name is now + # aws_rds_database_connections_average (was aws_rds_database_connections_sum). + # Update any Grafana dashboards querying the old _sum metric name. metric {{ name = "DatabaseConnections" statistics = ["Average"] diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index 614d40d..7136f1a 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -11,6 +11,9 @@ def format_lb_tags(tags: dict[str, str]) -> str: if not key: msg = "LB tag key must not be empty" raise ValueError(msg) + if len(key) > 128: + msg = f"LB tag key exceeds AWS 128-character limit ({len(key)} chars): {key!r}" + raise ValueError(msg) if "," in key or "=" in key: msg = f"LB tag key contains invalid characters (comma or equals): {key}" raise ValueError(msg) @@ -20,6 +23,9 @@ def format_lb_tags(tags: dict[str, str]) -> str: if not value: msg = f"LB tag value must not be empty: key={key}" raise ValueError(msg) + if len(value) > 256: + msg = f"LB tag value exceeds AWS 256-character limit ({len(value)} chars): key={key}" + raise ValueError(msg) if "," in value or "=" in value: msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" raise ValueError(msg) diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index e00c84b..36345ce 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -80,3 +80,13 @@ def test_format_lb_tags_tab_in_value() -> None: def test_format_lb_tags_newline_in_value() -> None: with pytest.raises(ValueError, match="whitespace"): format_lb_tags({"key": "bad\nvalue"}) + + +def test_format_lb_tags_key_too_long() -> None: + with pytest.raises(ValueError, match="128-character limit"): + format_lb_tags({"k" * 129: "value"}) + + +def test_format_lb_tags_value_too_long() -> None: + with pytest.raises(ValueError, match="256-character limit"): + format_lb_tags({"key": "v" * 257}) From c4370f703e839939191693dce0e0aa994a22ff73 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 12/27] Address review findings (job 99) All 133 tests pass. The Go build also succeeds. Changes: - `natgateway.yaml`: Add calibration comment to `nat_gateway_packets_dropped` threshold of 100 packets, noting it may need adjustment for high-throughput gateways - `lib.py`: Update `format_lb_tags` docstring to explicitly document that whitespace rejection in tag values is a deliberate constraint (not an AWS limit), since AWS tag values do permit spaces --- python-pulumi/src/ptd/grafana_alerts/natgateway.yaml | 2 +- python-pulumi/src/ptd/pulumi_resources/lib.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 58715d3..df3c24b 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -98,7 +98,7 @@ groups: conditions: - evaluator: params: - - 100 + - 100 # Calibrated as a conservative baseline; high-throughput gateways may see this normally. Adjust per environment. type: gt operator: type: and diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index 7136f1a..e3306a5 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -2,7 +2,10 @@ def format_lb_tags(tags: dict[str, str]) -> str: """Format tags as comma-separated key=value pairs for AWS LB Controller annotations. Validates that tag keys and values do not contain commas or equals signs, - which would break the annotation format. + which would break the annotation format. Whitespace (spaces, tabs, newlines) + is also rejected in both keys and values; while AWS tag values permit spaces, + this function is used exclusively for LB controller annotation strings where + whitespace would be ambiguous. This is a deliberate constraint, not an AWS limit. """ if not tags: msg = "tags must not be empty" From 2a74ad7bb66df83285ff59110a0e8ce2121a6dd2 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 13/27] Address review findings (job 101) All 134 tests pass (including the new `test_format_lb_tags_aws_reserved_prefix` test). Changes: - Add `aws:` reserved prefix validation to `format_lb_tags` in `lib.py`, surfacing AWS API errors at graph-construction time - Add test for `aws:` prefix rejection in `test_lib.py` - Collect both `Sum` and `Average` statistics for `DatabaseConnections` in `grafana_alloy.py` to preserve the `_sum` metric during dashboard migration --- python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 9 +++++---- python-pulumi/src/ptd/pulumi_resources/lib.py | 3 +++ python-pulumi/tests/test_lib.py | 5 +++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index d1ff5bd..cca4258 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -262,12 +262,13 @@ def _define_config_map( period = "5m" }} - # Changed from Sum to Average; Prometheus metric name is now - # aws_rds_database_connections_average (was aws_rds_database_connections_sum). - # Update any Grafana dashboards querying the old _sum metric name. + # Collecting both Sum and Average during migration. Average is the + # target metric (aws_rds_database_connections_average); Sum + # (aws_rds_database_connections_sum) is kept temporarily for existing + # dashboards. Remove Sum once all dashboards are updated. metric {{ name = "DatabaseConnections" - statistics = ["Average"] + statistics = ["Average", "Sum"] period = "5m" }} diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index e3306a5..f10316f 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -14,6 +14,9 @@ def format_lb_tags(tags: dict[str, str]) -> str: if not key: msg = "LB tag key must not be empty" raise ValueError(msg) + if key.startswith("aws:"): + msg = f"LB tag key uses reserved 'aws:' prefix: {key!r}" + raise ValueError(msg) if len(key) > 128: msg = f"LB tag key exceeds AWS 128-character limit ({len(key)} chars): {key!r}" raise ValueError(msg) diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index 36345ce..d6933fc 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -90,3 +90,8 @@ def test_format_lb_tags_key_too_long() -> None: def test_format_lb_tags_value_too_long() -> None: with pytest.raises(ValueError, match="256-character limit"): format_lb_tags({"key": "v" * 257}) + + +def test_format_lb_tags_aws_reserved_prefix() -> None: + with pytest.raises(ValueError, match="reserved 'aws:' prefix"): + format_lb_tags({"aws:foo": "bar"}) From b1754b766b93847597907804913b10681ba3d31a Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 14/27] Address review findings (job 102) All 134 tests pass. Changes: - `rds.yaml`: Add instance-class caveat to `rds_freeable_memory_low` description to warn about false positives on small instances (e.g. db.t3.micro, db.t3.small) - `lib.py`: Improve error message for None/empty tag values from "must not be empty" to "must not be None or empty" - `grafana_alloy.py`: Change `UnHealthyHostCount` collection period from `1m` to `5m` for both ALB and NLB to match the alert group interval - `test_lib.py`: Update `test_format_lb_tags_empty_value` assertion to match the updated error message --- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 2 +- python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 4 ++-- python-pulumi/src/ptd/pulumi_resources/lib.py | 2 +- python-pulumi/tests/test_lib.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 2203299..64a9632 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -182,7 +182,7 @@ groups: execErrState: Error for: 10m annotations: - description: RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes. + description: RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes. Note: this threshold is calibrated for db.r5.large (~16 GiB RAM); it will fire continuously for small instances (e.g. db.t3.micro, db.t3.small). Adjust per instance class. summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index cca4258..7ec5c30 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -372,7 +372,7 @@ def _define_config_map( metric {{ name = "UnHealthyHostCount" statistics = ["Average"] - period = "1m" + period = "5m" }} metric {{ @@ -399,7 +399,7 @@ def _define_config_map( metric {{ name = "UnHealthyHostCount" statistics = ["Average"] - period = "1m" + period = "5m" }} }} }} diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index f10316f..eaf4040 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -27,7 +27,7 @@ def format_lb_tags(tags: dict[str, str]) -> str: msg = f"LB tag key contains invalid whitespace character: {key!r}" raise ValueError(msg) if not value: - msg = f"LB tag value must not be empty: key={key}" + msg = f"LB tag value must not be None or empty: key={key}" raise ValueError(msg) if len(value) > 256: msg = f"LB tag value exceeds AWS 256-character limit ({len(value)} chars): key={key}" diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index d6933fc..dad8472 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -43,7 +43,7 @@ def test_format_lb_tags_empty_key() -> None: def test_format_lb_tags_empty_value() -> None: - with pytest.raises(ValueError, match="must not be empty"): + with pytest.raises(ValueError, match="must not be None or empty"): format_lb_tags({"key": ""}) From 064612b3af2513169646f262840dc4c00599c27b Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 15/27] Address review findings (job 104) All 135 tests pass. Changes: - Replaced exact-string assertion in `test_build_nlb_tag_string_happy_path` with a parsed key-value dict assertion to avoid coupling the test to dict insertion order - Added `test_build_nlb_tag_string_empty_cluster_name` to cover the empty `cluster_name=""` edge case, which raises `ValueError` with "must not be None or empty" --- python-pulumi/tests/test_traefik_nlb_tags.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py index 565e47d..9fb45ee 100644 --- a/python-pulumi/tests/test_traefik_nlb_tags.py +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -8,7 +8,13 @@ def test_build_nlb_tag_string_happy_path() -> None: tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, cluster_name="myapp-cluster", ) - assert result == "posit.team/true-name=myapp,posit.team/environment=production,Name=myapp-cluster" + # Parse into key=value pairs to avoid coupling the test to dict insertion order + parsed = dict(pair.split("=", 1) for pair in result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-cluster", + } def test_build_nlb_tag_string_tags_none() -> None: @@ -38,3 +44,11 @@ def test_build_nlb_tag_string_invalid_cluster_name() -> None: tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, cluster_name="bad,name", ) + + +def test_build_nlb_tag_string_empty_cluster_name() -> None: + with pytest.raises(ValueError, match="must not be None or empty"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, + cluster_name="", + ) From 7302197fa63f7167406d690d96bb37ae8c7c764f Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 16/27] Address review findings (job 106) All changes verified. 136 tests pass. --- Changes: - Increase `alb_unhealthy_targets` and `nlb_unhealthy_targets` alert `for` from `5m` to `10m` to reduce false positives during rolling deployments/node drains - Add `"\r"` (carriage return) to whitespace rejection checks in `format_lb_tags()` for both keys and values - Add test `test_build_nlb_tag_string_extra_tags_are_dropped` to document that extra tags passed to `_build_nlb_tag_string` are intentionally discarded --- .../src/ptd/grafana_alerts/loadbalancer.yaml | 8 ++++---- python-pulumi/src/ptd/pulumi_resources/lib.py | 4 ++-- python-pulumi/tests/test_traefik_nlb_tags.py | 20 +++++++++++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml index 0b3e9c6..f39de9e 100644 --- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -123,9 +123,9 @@ groups: type: threshold noDataState: NoData execErrState: Error - for: 5m + for: 10m annotations: - description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. + description: Application Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments. summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} labels: opsgenie: "1" @@ -178,9 +178,9 @@ groups: type: threshold noDataState: NoData execErrState: Error - for: 5m + for: 10m annotations: - description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. + description: Network Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments and node drains. summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} labels: opsgenie: "1" diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index eaf4040..16dd35f 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -23,7 +23,7 @@ def format_lb_tags(tags: dict[str, str]) -> str: if "," in key or "=" in key: msg = f"LB tag key contains invalid characters (comma or equals): {key}" raise ValueError(msg) - if any(c in key for c in (" ", "\t", "\n")): + if any(c in key for c in (" ", "\t", "\n", "\r")): msg = f"LB tag key contains invalid whitespace character: {key!r}" raise ValueError(msg) if not value: @@ -35,7 +35,7 @@ def format_lb_tags(tags: dict[str, str]) -> str: if "," in value or "=" in value: msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" raise ValueError(msg) - if any(c in value for c in (" ", "\t", "\n")): + if any(c in value for c in (" ", "\t", "\n", "\r")): msg = f"LB tag value contains invalid whitespace character: {key}={value!r}" raise ValueError(msg) return ",".join(f"{k}={v}" for k, v in tags.items()) diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py index 9fb45ee..947b6d9 100644 --- a/python-pulumi/tests/test_traefik_nlb_tags.py +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -52,3 +52,23 @@ def test_build_nlb_tag_string_empty_cluster_name() -> None: tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, cluster_name="", ) + + +def test_build_nlb_tag_string_extra_tags_are_dropped() -> None: + """Extra tags in the input dict (e.g. aws:created-by, Cost-Center) are intentionally + discarded; only true-name, environment, and Name should appear in the output.""" + result = _build_nlb_tag_string( + tags={ + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "aws:created-by": "someone", + "Cost-Center": "123", + }, + cluster_name="myapp-cluster", + ) + parsed = dict(pair.split("=", 1) for pair in result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-cluster", + } From 7deed4e071d7de2bc6e981c7adba18f09de542ed Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 17/27] Address review findings (job 108) All tests pass. Here's a summary of the changes made: Changes: - Fix `traefik.py` `_build_nlb_tag_string` to use `msg = ...; raise ValueError(msg)` pattern consistently with `lib.py` (finding 4) - Extract `_build_alb_tag_string(true_name, environment, compound_name)` module-level helper from `_define_ingress_alb_annotations` in `aws_workload_helm.py` (finding 3) - Update `_define_ingress_alb_annotations` to call the new `_build_alb_tag_string` helper - Add `tests/test_alb_tags.py` with 4 tests covering happy path, tag key presence, invalid compound_name, and empty compound_name (finding 3) --- .../ptd/pulumi_resources/aws_workload_helm.py | 23 +++++++--- .../src/ptd/pulumi_resources/traefik.py | 9 ++-- python-pulumi/tests/test_alb_tags.py | 46 +++++++++++++++++++ 3 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 python-pulumi/tests/test_alb_tags.py diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 602a75b..18a55a9 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -14,6 +14,17 @@ ALLOY_NAMESPACE = "alloy" +def _build_alb_tag_string(true_name: str, environment: str, compound_name: str) -> str: + """Build the ALB annotation tag string from workload config values.""" + return format_lb_tags( + { + "posit.team/true-name": true_name, + "posit.team/environment": environment, + "Name": compound_name, + } + ) + + class AWSWorkloadHelm(pulumi.ComponentResource): workload: ptd.aws_workload.AWSWorkload @@ -829,12 +840,6 @@ def _define_per_site_ingress_annotations( def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str]: # cfg.true_name, cfg.environment, and compound_name are plain str values # loaded from YAML config at startup (see ptd/workload.py), not Pulumi Outputs. - tags = { - "posit.team/true-name": self.workload.cfg.true_name, - "posit.team/environment": self.workload.cfg.environment, - "Name": self.workload.compound_name, - } - annotations = { "alb.ingress.kubernetes.io/ssl-redirect": "443", "alb.ingress.kubernetes.io/listen-ports": json.dumps([{"HTTP": 80}, {"HTTPS": 443}]), @@ -845,7 +850,11 @@ def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str "alb.ingress.kubernetes.io/healthcheck-path": "/ping", "alb.ingress.kubernetes.io/healthcheck-port": "32090", "alb.ingress.kubernetes.io/load-balancer-attributes": "routing.http.drop_invalid_header_fields.enabled=true,idle_timeout.timeout_seconds=300", - "alb.ingress.kubernetes.io/tags": format_lb_tags(tags), + "alb.ingress.kubernetes.io/tags": _build_alb_tag_string( + self.workload.cfg.true_name, + self.workload.cfg.environment, + self.workload.compound_name, + ), } if self.workload.cfg.provisioned_vpc: diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 6b101b1..1078641 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -13,22 +13,25 @@ def _build_nlb_tag_string(tags: dict[str, str] | None, cluster_name: str) -> str: """Build the NLB annotation tag string from cluster tags.""" if tags is None: - raise ValueError( + msg = ( "Cluster tags must not be None; expected a dict with " "'posit.team/true-name' and 'posit.team/environment' for NLB tagging." ) + raise ValueError(msg) true_name = tags.get("posit.team/true-name") environment = tags.get("posit.team/environment") if true_name is None: - raise ValueError( + msg = ( f"Missing required tag: 'posit.team/true-name' for NLB tagging. " f"Available tags: {list(tags.keys())}" ) + raise ValueError(msg) if environment is None: - raise ValueError( + msg = ( f"Missing required tag: 'posit.team/environment' for NLB tagging. " f"Available tags: {list(tags.keys())}" ) + raise ValueError(msg) return format_lb_tags( { "posit.team/true-name": true_name, diff --git a/python-pulumi/tests/test_alb_tags.py b/python-pulumi/tests/test_alb_tags.py new file mode 100644 index 0000000..4ee356b --- /dev/null +++ b/python-pulumi/tests/test_alb_tags.py @@ -0,0 +1,46 @@ +import pytest + +from ptd.pulumi_resources.aws_workload_helm import _build_alb_tag_string + + +def test_build_alb_tag_string_happy_path() -> None: + result = _build_alb_tag_string( + true_name="myapp", + environment="production", + compound_name="myapp-production", + ) + parsed = dict(pair.split("=", 1) for pair in result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-production", + } + + +def test_build_alb_tag_string_tags_key_present() -> None: + result = _build_alb_tag_string( + true_name="myapp", + environment="staging", + compound_name="myapp-staging", + ) + assert "posit.team/true-name=myapp" in result + assert "posit.team/environment=staging" in result + assert "Name=myapp-staging" in result + + +def test_build_alb_tag_string_invalid_compound_name() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_alb_tag_string( + true_name="myapp", + environment="production", + compound_name="bad,name", + ) + + +def test_build_alb_tag_string_empty_compound_name() -> None: + with pytest.raises(ValueError, match="must not be None or empty"): + _build_alb_tag_string( + true_name="myapp", + environment="production", + compound_name="", + ) From c493b8723f5b11b042dfe6687e720ceb6247ff1e Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 18/27] Address review findings (job 111) Changes: - Set `noDataState: Alerting` on `rds_free_storage_low` and `rds_freeable_memory_low` in `rds.yaml` so metric scraping failures surface as alerts rather than silently hiding - Set `noDataState: Alerting` on `nat_gateway_port_allocation_errors` in `natgateway.yaml` for the same reason - Added comment on the `alb_target_5xx_errors_high` threshold clarifying it is an absolute count (not an error rate) and noting its traffic-volume sensitivity - Added `test_build_nlb_tag_string_invalid_true_name_value` and `test_build_nlb_tag_string_invalid_environment_value` tests to `test_traefik_nlb_tags.py` to cover invalid characters in tag values extracted from cluster tags - Changed `test_format_lb_tags_normal` in `test_lib.py` to use round-trip parse instead of exact insertion-order string assertion - Added a `TODO` comment to the `DatabaseConnections` dual-collection block in `grafana_alloy.py` to track the pending cleanup --- .../src/ptd/grafana_alerts/loadbalancer.yaml | 2 +- .../src/ptd/grafana_alerts/natgateway.yaml | 2 +- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 4 ++-- .../src/ptd/pulumi_resources/grafana_alloy.py | 1 + python-pulumi/tests/test_lib.py | 7 ++++++- python-pulumi/tests/test_traefik_nlb_tags.py | 16 ++++++++++++++++ 6 files changed, 27 insertions(+), 5 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml index f39de9e..4e7f244 100644 --- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -47,7 +47,7 @@ groups: conditions: - evaluator: params: - - 10 + - 10 # Absolute count, not an error rate. High-traffic ALBs may never breach this; low-traffic ALBs may page on transient spikes. Consider a rate-based alert if traffic volume varies significantly across environments. type: gt operator: type: and diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index df3c24b..6fcca4c 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -62,7 +62,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: Alerting execErrState: Error for: 5m annotations: diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 64a9632..0da0175 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -123,7 +123,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: Alerting execErrState: Error for: 5m annotations: @@ -178,7 +178,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: Alerting execErrState: Error for: 10m annotations: diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 7ec5c30..127a205 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -266,6 +266,7 @@ def _define_config_map( # target metric (aws_rds_database_connections_average); Sum # (aws_rds_database_connections_sum) is kept temporarily for existing # dashboards. Remove Sum once all dashboards are updated. + # TODO: Open a tracking issue to ensure this cleanup happens. metric {{ name = "DatabaseConnections" statistics = ["Average", "Sum"] diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index dad8472..b3cad37 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -10,7 +10,12 @@ def test_format_lb_tags_normal() -> None: "Name": "myapp-production", } result = format_lb_tags(tags) - assert result == "posit.team/true-name=myapp,posit.team/environment=production,Name=myapp-production" + parsed = dict(pair.split("=", 1) for pair in result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-production", + } def test_format_lb_tags_single_entry() -> None: diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py index 947b6d9..9bad2ba 100644 --- a/python-pulumi/tests/test_traefik_nlb_tags.py +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -54,6 +54,22 @@ def test_build_nlb_tag_string_empty_cluster_name() -> None: ) +def test_build_nlb_tag_string_invalid_true_name_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "bad,name", "posit.team/environment": "prod"}, + cluster_name="cluster", + ) + + +def test_build_nlb_tag_string_invalid_environment_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "bad=env"}, + cluster_name="cluster", + ) + + def test_build_nlb_tag_string_extra_tags_are_dropped() -> None: """Extra tags in the input dict (e.g. aws:created-by, Cost-Center) are intentionally discarded; only true-name, environment, and Name should appear in the output.""" From 12ae1854712f5bd013e6c791166bf91f3bc77716 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 19/27] Address review findings (job 113) Changes: - Validate `true_name` against `[a-zA-Z0-9._-]+` before interpolating into Alloy River config f-strings in `grafana_alloy.py`, preventing malformed config from special characters - Change `nat_gateway_packets_dropped` `noDataState` from `NoData` to `Alerting` in `natgateway.yaml` to match `nat_gateway_port_allocation_errors` (both use the same metric source) - Add `\r` (carriage return) test cases for both key and value in `test_lib.py` to complete whitespace rejection coverage --- python-pulumi/src/ptd/grafana_alerts/natgateway.yaml | 2 +- .../src/ptd/pulumi_resources/grafana_alloy.py | 5 +++++ python-pulumi/tests/test_lib.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 6fcca4c..1e39164 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -117,7 +117,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: Alerting execErrState: Error for: 5m annotations: diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 127a205..a0a7173 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -223,6 +223,11 @@ def _define_config_map( # Generate CloudWatch exporter configuration for AWS cloudwatch_config = "" if self.cloud_provider == "aws": + if not re.match(r"^[a-zA-Z0-9._-]+$", self.workload.cfg.true_name): + raise ValueError( + f"workload true_name contains characters unsafe for Alloy River config: " + f"{self.workload.cfg.true_name!r}. Must match [a-zA-Z0-9._-]+" + ) cloudwatch_config = textwrap.dedent(f""" prometheus.exporter.cloudwatch "cloudwatch" {{ sts_region = "{self.region}" diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index b3cad37..85175b4 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -100,3 +100,13 @@ def test_format_lb_tags_value_too_long() -> None: def test_format_lb_tags_aws_reserved_prefix() -> None: with pytest.raises(ValueError, match="reserved 'aws:' prefix"): format_lb_tags({"aws:foo": "bar"}) + + +def test_format_lb_tags_carriage_return_in_key() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"bad\rkey": "value"}) + + +def test_format_lb_tags_carriage_return_in_value() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"key": "bad\rvalue"}) From 64688cf13ca5ce40fe6cb70d74965cd7d08c56e5 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 20/27] Address review findings (job 116) All 149 tests pass. Here's a summary of the changes: Changes: - `natgateway.yaml`: Revert both NAT gateway rules from `noDataState: Alerting` to `noDataState: NoData` to match RDS/ALB/NLB rules and prevent false positives on exporter restarts - `rds.yaml`: Add `instance_size_dependent: "true"` label to `rds_freeable_memory_low` so operators can create targeted silences for known-small instance classes - `grafana_alloy.py`: Extract `_validate_alloy_true_name()` module-level function from the inline `re.match` check in `__init__` - `test_grafana_alloy.py`: Add `TestValidateAlloyTrueName` class covering valid names and rejections of `"`, `{`, `}`, and space characters - `grafana_alloy.py`: Replace vague "TODO: Open a tracking issue" with an actionable comment describing exactly what condition triggers cleanup and the cost implication --- .../src/ptd/grafana_alerts/natgateway.yaml | 4 +-- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 1 + .../src/ptd/pulumi_resources/grafana_alloy.py | 25 +++++++++++----- python-pulumi/tests/test_grafana_alloy.py | 29 ++++++++++++++++++- 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 1e39164..df3c24b 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -62,7 +62,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting + noDataState: NoData execErrState: Error for: 5m annotations: @@ -117,7 +117,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting + noDataState: NoData execErrState: Error for: 5m annotations: diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 0da0175..f15c3dd 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -186,6 +186,7 @@ groups: summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + instance_size_dependent: "true" # Silence this label for known-small instance classes isPaused: false - uid: rds_read_latency_high title: RDS Read Latency High diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index a0a7173..df263eb 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -28,6 +28,20 @@ class PTDComponentForAlloy: T = typing.TypeVar("T") +def _validate_alloy_true_name(true_name: str) -> None: + """Validate that true_name is safe for interpolation into Alloy River config. + + Alloy River config uses double-quoted strings; characters like `"`, `{`, `}` would + break the generated config or allow injection. This validation is enforced at + graph-construction time so failures are caught during `pulumi preview`. + """ + if not re.match(r"^[a-zA-Z0-9._-]+$", true_name): + raise ValueError( + f"workload true_name contains characters unsafe for Alloy River config: " + f"{true_name!r}. Must match [a-zA-Z0-9._-]+" + ) + + class AlloyConfig(pulumi.ComponentResource): namespace: str config_map: kubernetes.core.v1.ConfigMap @@ -223,11 +237,7 @@ def _define_config_map( # Generate CloudWatch exporter configuration for AWS cloudwatch_config = "" if self.cloud_provider == "aws": - if not re.match(r"^[a-zA-Z0-9._-]+$", self.workload.cfg.true_name): - raise ValueError( - f"workload true_name contains characters unsafe for Alloy River config: " - f"{self.workload.cfg.true_name!r}. Must match [a-zA-Z0-9._-]+" - ) + _validate_alloy_true_name(self.workload.cfg.true_name) cloudwatch_config = textwrap.dedent(f""" prometheus.exporter.cloudwatch "cloudwatch" {{ sts_region = "{self.region}" @@ -270,8 +280,9 @@ def _define_config_map( # Collecting both Sum and Average during migration. Average is the # target metric (aws_rds_database_connections_average); Sum # (aws_rds_database_connections_sum) is kept temporarily for existing - # dashboards. Remove Sum once all dashboards are updated. - # TODO: Open a tracking issue to ensure this cleanup happens. + # dashboards. Remove ["Sum"] from statistics once all Grafana dashboards + # have been updated to query aws_rds_database_connections_average. + # Keeping Sum doubles the CloudWatch API cost for this metric. metric {{ name = "DatabaseConnections" statistics = ["Average", "Sum"] diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index bcc0f33..6f9e282 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -2,9 +2,36 @@ from pathlib import Path from unittest.mock import Mock +import pytest import yaml -from ptd.pulumi_resources.grafana_alloy import AlloyConfig +from ptd.pulumi_resources.grafana_alloy import AlloyConfig, _validate_alloy_true_name + + +class TestValidateAlloyTrueName: + def test_valid_names(self) -> None: + _validate_alloy_true_name("myapp") + _validate_alloy_true_name("my-app") + _validate_alloy_true_name("my.app.v2") + _validate_alloy_true_name("app_name") + _validate_alloy_true_name("myapp-production") + _validate_alloy_true_name("a1b2c3") + + def test_double_quote_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name('bad"name') + + def test_open_brace_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("bad{name}") + + def test_close_brace_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("bad}name") + + def test_space_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("bad name") @dataclasses.dataclass From 90cd258d07d05253e0b0328192500f1e84077230 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 21/27] Address review findings (job 120) All tests pass. Here's a summary: Changes: - Confirmed `import re` is already present in `grafana_alloy.py` (no change needed) - Added `noDataState` rationale comments to all 5 rules in `rds.yaml` (Alerting for storage/memory exhaustion; NoData for performance metrics) - Changed `nat_gateway_port_allocation_errors` in `natgateway.yaml` from `noDataState: NoData` to `noDataState: Alerting` to match its "critical" annotation - Added boundary tests `test_format_lb_tags_key_at_limit` (128 chars) and `test_format_lb_tags_value_at_limit` (256 chars) to `test_lib.py` - Added `test_empty_string_rejected` test to `TestValidateAlloyTrueName` in `test_grafana_alloy.py` - Added `# TODO: Open a GitHub issue to track removal of Sum once dashboards are updated.` to the `DatabaseConnections` dual-statistics comment in `grafana_alloy.py` --- python-pulumi/src/ptd/grafana_alerts/natgateway.yaml | 2 +- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 10 +++++----- .../src/ptd/pulumi_resources/grafana_alloy.py | 1 + python-pulumi/tests/test_grafana_alloy.py | 4 ++++ python-pulumi/tests/test_lib.py | 10 ++++++++++ 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index df3c24b..fd04522 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -62,7 +62,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: Alerting # Critical alert: scrape outage should not suppress; missing data itself is suspicious execErrState: Error for: 5m annotations: diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index f15c3dd..a0deab6 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -68,7 +68,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable execErrState: Error for: 10m annotations: @@ -123,7 +123,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting + noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full disk execErrState: Error for: 5m annotations: @@ -178,7 +178,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting + noDataState: Alerting # Memory exhaustion is latent; alert even when scraping stops so we don't silently miss an OOM condition execErrState: Error for: 10m annotations: @@ -234,7 +234,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable execErrState: Error for: 10m annotations: @@ -289,7 +289,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable execErrState: Error for: 5m annotations: diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index df263eb..eb83107 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -283,6 +283,7 @@ def _define_config_map( # dashboards. Remove ["Sum"] from statistics once all Grafana dashboards # have been updated to query aws_rds_database_connections_average. # Keeping Sum doubles the CloudWatch API cost for this metric. + # TODO: Open a GitHub issue to track removal of Sum once dashboards are updated. metric {{ name = "DatabaseConnections" statistics = ["Average", "Sum"] diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index 6f9e282..4e678ff 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -33,6 +33,10 @@ def test_space_rejected(self) -> None: with pytest.raises(ValueError, match="unsafe for Alloy River config"): _validate_alloy_true_name("bad name") + def test_empty_string_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("") + @dataclasses.dataclass class MockSiteConfig: diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index 85175b4..828ecff 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -87,11 +87,21 @@ def test_format_lb_tags_newline_in_value() -> None: format_lb_tags({"key": "bad\nvalue"}) +def test_format_lb_tags_key_at_limit() -> None: + # 128 chars — should succeed (boundary check for > comparison) + format_lb_tags({"k" * 128: "value"}) + + def test_format_lb_tags_key_too_long() -> None: with pytest.raises(ValueError, match="128-character limit"): format_lb_tags({"k" * 129: "value"}) +def test_format_lb_tags_value_at_limit() -> None: + # 256 chars — should succeed (boundary check for > comparison) + format_lb_tags({"key": "v" * 256}) + + def test_format_lb_tags_value_too_long() -> None: with pytest.raises(ValueError, match="256-character limit"): format_lb_tags({"key": "v" * 257}) From 3fed805d34ca6d9984c7c6b0571d83d9f21c11b6 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 22/27] Address review findings (job 122) All 154 tests pass. Changes: - Added `instance_size_dependent: "true"` label to `rds_database_connections_high` in `rds.yaml` for consistency with `rds_freeable_memory_low`, enabling operators to silence the alert for known-small instance classes - Added `test_build_alb_tag_string_invalid_true_name_value` and `test_build_alb_tag_string_invalid_environment_value` tests to `test_alb_tags.py` to match the invalidity coverage already present in `test_traefik_nlb_tags.py` --- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 1 + python-pulumi/tests/test_alb_tags.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index a0deab6..ae9509f 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -297,4 +297,5 @@ groups: summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + instance_size_dependent: "true" # Silence this label for known-small instance classes isPaused: false diff --git a/python-pulumi/tests/test_alb_tags.py b/python-pulumi/tests/test_alb_tags.py index 4ee356b..5704bdb 100644 --- a/python-pulumi/tests/test_alb_tags.py +++ b/python-pulumi/tests/test_alb_tags.py @@ -44,3 +44,21 @@ def test_build_alb_tag_string_empty_compound_name() -> None: environment="production", compound_name="", ) + + +def test_build_alb_tag_string_invalid_true_name_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_alb_tag_string( + true_name="bad,name", + environment="production", + compound_name="myapp-production", + ) + + +def test_build_alb_tag_string_invalid_environment_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_alb_tag_string( + true_name="myapp", + environment="bad=env", + compound_name="myapp-production", + ) From 29be3c3c83a4d24b8b87e8dd3454d6e2719777a8 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 23/27] Address review findings (job 123) All tests pass. Here's a summary of the changes made: Changes: - Paused `rds_freeable_memory_low` and `rds_database_connections_high` alerts (`isPaused: true`) until Grafana silence/Alertmanager inhibit config is in place for the `instance_size_dependent` label - Changed `nat_gateway_port_allocation_errors` `noDataState` from `Alerting` to `NoData` to prevent spurious pages during scrape outages, with a comment to add a dedicated "CloudWatch scrape down" alert - Extracted CloudWatch config generation from `_define_config_map` into a new `_define_cloudwatch_config` method to enable unit testing - Replaced the floating TODO comment about the DatabaseConnections Sum cost with a NOTE that is less likely to be overlooked - Added `TestDefineCloudwatchConfig` tests asserting that `AWS/NATGateway`, `AWS/ApplicationELB`, and `AWS/NetworkELB` discovery blocks appear in the rendered config for AWS, and that non-AWS returns an empty string --- .../src/ptd/grafana_alerts/natgateway.yaml | 2 +- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 4 +- .../src/ptd/pulumi_resources/grafana_alloy.py | 352 +++++++++--------- python-pulumi/tests/test_grafana_alloy.py | 39 ++ 4 files changed, 220 insertions(+), 177 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index fd04522..479ed1e 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -62,7 +62,7 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting # Critical alert: scrape outage should not suppress; missing data itself is suspicious + noDataState: NoData # Using NoData to avoid spurious pages during scrape outages (Alloy restart, credential rotation, brief partitions). Add a separate "CloudWatch scrape down" alert to cover the outage case independently. execErrState: Error for: 5m annotations: diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index ae9509f..6d49ff6 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -187,7 +187,7 @@ groups: labels: opsgenie: "1" instance_size_dependent: "true" # Silence this label for known-small instance classes - isPaused: false + isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label - uid: rds_read_latency_high title: RDS Read Latency High condition: B @@ -298,4 +298,4 @@ groups: labels: opsgenie: "1" instance_size_dependent: "true" # Silence this label for known-small instance classes - isPaused: false + isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index eb83107..6761405 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -218,218 +218,222 @@ def _define_blackbox_config() -> str: many_spaces = re.compile(r"\s+") return many_spaces.sub(" ", cfg).strip() - def _define_config_map( - self, - name: str, - namespace: str, - ): - control_room_url = f"https://mimir.{self.workload.cfg.control_room_domain}/api/v1/push" - workload_url = "http://mimir-gateway.mimir.svc.cluster.local/api/v1/push" - loki_url = "http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push" + def _define_cloudwatch_config(self) -> str: + """Generate CloudWatch exporter configuration for AWS. Returns empty string for non-AWS.""" + if self.cloud_provider != "aws": + return "" + _validate_alloy_true_name(self.workload.cfg.true_name) + return textwrap.dedent(f""" + prometheus.exporter.cloudwatch "cloudwatch" {{ + sts_region = "{self.region}" - if isinstance(self.workload, ptd.azure_workload.AzureWorkload): - account_id = self.workload.cfg.tenant_id - cluster_name = self.workload.cluster_name(self.release) - else: - account_id = self.workload.cfg.account_id - cluster_name = self.workload.eks_cluster_name(self.release) + discovery {{ + type = "AWS/FSx" + regions = ["{self.region}"] - # Generate CloudWatch exporter configuration for AWS - cloudwatch_config = "" - if self.cloud_provider == "aws": - _validate_alloy_true_name(self.workload.cfg.true_name) - cloudwatch_config = textwrap.dedent(f""" - prometheus.exporter.cloudwatch "cloudwatch" {{ - sts_region = "{self.region}" - - discovery {{ - type = "AWS/FSx" - regions = ["{self.region}"] - - search_tags = {{ - Name = "{self.workload.compound_name}", - }} + search_tags = {{ + Name = "{self.workload.compound_name}", + }} - metric {{ - name = "StorageCapacity" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "StorageCapacity" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "UsedStorageCapacity" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "UsedStorageCapacity" + statistics = ["Average"] + period = "5m" }} + }} - discovery {{ - type = "AWS/RDS" - regions = ["{self.region}"] + discovery {{ + type = "AWS/RDS" + regions = ["{self.region}"] - search_tags = {{ - Name = "{self.workload.compound_name}", - }} + search_tags = {{ + Name = "{self.workload.compound_name}", + }} - metric {{ - name = "FreeStorageSpace" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "FreeStorageSpace" + statistics = ["Average"] + period = "5m" + }} - # Collecting both Sum and Average during migration. Average is the - # target metric (aws_rds_database_connections_average); Sum - # (aws_rds_database_connections_sum) is kept temporarily for existing - # dashboards. Remove ["Sum"] from statistics once all Grafana dashboards - # have been updated to query aws_rds_database_connections_average. - # Keeping Sum doubles the CloudWatch API cost for this metric. - # TODO: Open a GitHub issue to track removal of Sum once dashboards are updated. - metric {{ - name = "DatabaseConnections" - statistics = ["Average", "Sum"] - period = "5m" - }} + # Collecting both Sum and Average during migration. Average is the + # target metric (aws_rds_database_connections_average); Sum + # (aws_rds_database_connections_sum) is kept temporarily for existing + # dashboards. Remove ["Sum"] from statistics once all Grafana dashboards + # have been updated to query aws_rds_database_connections_average. + # NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. + # Create a GitHub issue to track this removal before it is forgotten. + metric {{ + name = "DatabaseConnections" + statistics = ["Average", "Sum"] + period = "5m" + }} - metric {{ - name = "ReadLatency" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "ReadLatency" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "CPUUtilization" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "CPUUtilization" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "FreeableMemory" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "FreeableMemory" + statistics = ["Average"] + period = "5m" + }} - # Collected for dashboard visibility; no alert rules defined - metric {{ - name = "WriteLatency" - statistics = ["Average"] - period = "5m" - }} + # Collected for dashboard visibility; no alert rules defined + metric {{ + name = "WriteLatency" + statistics = ["Average"] + period = "5m" + }} - # Collected for dashboard visibility; no alert rules defined - metric {{ - name = "Deadlocks" - statistics = ["Sum"] - period = "5m" - }} + # Collected for dashboard visibility; no alert rules defined + metric {{ + name = "Deadlocks" + statistics = ["Sum"] + period = "5m" }} + }} - discovery {{ - type = "AWS/EC2" - regions = ["{self.region}"] + discovery {{ + type = "AWS/EC2" + regions = ["{self.region}"] - search_tags = {{ - Name = "{self.workload.compound_name}", - }} + search_tags = {{ + Name = "{self.workload.compound_name}", + }} - metric {{ - name = "NetworkOut" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "NetworkOut" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "NetworkPacketsOut" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "NetworkPacketsOut" + statistics = ["Average"] + period = "5m" }} + }} - discovery {{ - type = "AWS/NATGateway" - regions = ["{self.region}"] + discovery {{ + type = "AWS/NATGateway" + regions = ["{self.region}"] - # NAT Gateways inherit VPC tags including posit.team/true-name - # (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) - search_tags = {{ - "posit.team/true-name" = "{self.workload.cfg.true_name}", - }} + # NAT Gateways inherit VPC tags including posit.team/true-name + # (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} - metric {{ - name = "ErrorPortAllocation" - statistics = ["Sum"] - period = "5m" - }} + metric {{ + name = "ErrorPortAllocation" + statistics = ["Sum"] + period = "5m" + }} - metric {{ - name = "PacketsDropCount" - statistics = ["Sum"] - period = "5m" - }} + metric {{ + name = "PacketsDropCount" + statistics = ["Sum"] + period = "5m" }} + }} - discovery {{ - type = "AWS/ApplicationELB" - regions = ["{self.region}"] + discovery {{ + type = "AWS/ApplicationELB" + regions = ["{self.region}"] - # ALBs are tagged at creation time via aws_workload_helm.py. - # LBs provisioned before this tag was added won't be discovered - # until the cluster is redeployed. - # FIXME: To tag existing ALBs without redeploying, use the AWS CLI: - # aws elbv2 add-tags --resource-arns \ - # --tags Key=posit.team/true-name,Value= - search_tags = {{ - "posit.team/true-name" = "{self.workload.cfg.true_name}", - }} + # ALBs are tagged at creation time via aws_workload_helm.py. + # LBs provisioned before this tag was added won't be discovered + # until the cluster is redeployed. + # FIXME: To tag existing ALBs without redeploying, use the AWS CLI: + # aws elbv2 add-tags --resource-arns \ + # --tags Key=posit.team/true-name,Value= + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} - metric {{ - name = "HTTPCode_Target_5XX_Count" - statistics = ["Sum"] - period = "5m" - }} + metric {{ + name = "HTTPCode_Target_5XX_Count" + statistics = ["Sum"] + period = "5m" + }} - metric {{ - name = "UnHealthyHostCount" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "UnHealthyHostCount" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "TargetResponseTime" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "TargetResponseTime" + statistics = ["Average"] + period = "5m" }} + }} - discovery {{ - type = "AWS/NetworkELB" - regions = ["{self.region}"] + discovery {{ + type = "AWS/NetworkELB" + regions = ["{self.region}"] - # NLBs are tagged at creation time via traefik.py. - # LBs provisioned before this tag was added won't be discovered - # until the cluster is redeployed. - # FIXME: To tag existing NLBs without redeploying, use the AWS CLI: - # aws elbv2 add-tags --resource-arns \ - # --tags Key=posit.team/true-name,Value= - search_tags = {{ - "posit.team/true-name" = "{self.workload.cfg.true_name}", - }} + # NLBs are tagged at creation time via traefik.py. + # LBs provisioned before this tag was added won't be discovered + # until the cluster is redeployed. + # FIXME: To tag existing NLBs without redeploying, use the AWS CLI: + # aws elbv2 add-tags --resource-arns \ + # --tags Key=posit.team/true-name,Value= + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} - metric {{ - name = "UnHealthyHostCount" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "UnHealthyHostCount" + statistics = ["Average"] + period = "5m" }} }} + }} - prometheus.scrape "cloudwatch" {{ - targets = prometheus.exporter.cloudwatch.cloudwatch.targets - forward_to = [prometheus.relabel.default.receiver] - clustering {{ - enabled = true - }} + prometheus.scrape "cloudwatch" {{ + targets = prometheus.exporter.cloudwatch.cloudwatch.targets + forward_to = [prometheus.relabel.default.receiver] + clustering {{ + enabled = true }} - """) + }} + """) + + def _define_config_map( + self, + name: str, + namespace: str, + ): + control_room_url = f"https://mimir.{self.workload.cfg.control_room_domain}/api/v1/push" + workload_url = "http://mimir-gateway.mimir.svc.cluster.local/api/v1/push" + loki_url = "http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push" + + if isinstance(self.workload, ptd.azure_workload.AzureWorkload): + account_id = self.workload.cfg.tenant_id + cluster_name = self.workload.cluster_name(self.release) + else: + account_id = self.workload.cfg.account_id + cluster_name = self.workload.eks_cluster_name(self.release) + + # Generate CloudWatch exporter configuration for AWS + cloudwatch_config = self._define_cloudwatch_config() # Generate system log scraping configuration system_logs_config = "" diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index 4e678ff..d3c2cf5 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -575,3 +575,42 @@ def test_multiple_sites_with_different_replica_counts(self): assert 'name = "site-two-workbench-fqdn"' in result assert 'name = "site-two-connect"' not in result assert 'name = "site-two-connect-fqdn"' not in result + + +def _make_alloy_for_cloudwatch(cloud_provider_name: str, true_name: str = "myapp", compound_name: str = "myapp-production") -> AlloyConfig: + """Helper to create an AlloyConfig instance with mocked attributes for cloudwatch tests.""" + alloy = AlloyConfig.__new__(AlloyConfig) + mock_workload = Mock() + mock_workload.cfg.true_name = true_name + mock_workload.compound_name = compound_name + mock_cloud_provider = Mock() + mock_cloud_provider.name = cloud_provider_name + mock_workload.cloud_provider = mock_cloud_provider + alloy.workload = mock_workload + alloy.cloud_provider = cloud_provider_name.lower() + alloy.region = "us-east-1" + return alloy + + +class TestDefineCloudwatchConfig: + """Tests for _define_cloudwatch_config method.""" + + def test_aws_contains_natgateway_discovery_block(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert 'AWS/NATGateway' in result + + def test_aws_contains_applicationelb_discovery_block(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert 'AWS/ApplicationELB' in result + + def test_aws_contains_networkelb_discovery_block(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert 'AWS/NetworkELB' in result + + def test_non_aws_returns_empty_string(self) -> None: + alloy = _make_alloy_for_cloudwatch("azure") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert result == "" From 64bd4e297360ee31bde17b620f544c87ad3ba305 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 24/27] Address review findings (job 126) All 160 tests pass and no new lint issues were introduced. --- Changes: - Validate `compound_name` in `_define_cloudwatch_config` with `_validate_alloy_true_name` to close the injection gap alongside `true_name` - Add test `test_invalid_true_name_raises_value_error` and `test_invalid_compound_name_raises_value_error` in `TestDefineCloudwatchConfig` to cover the validation code path - Add provisioning false-positive note to `rds_free_storage_low` annotation documenting expected `noDataState=Alerting` behavior during CloudWatch scrape initialization - Replace open-ended "Create a GitHub issue" comment with `TODO:` prefix on the `DatabaseConnections` Sum metric to make it searchable - Fix line length of `_make_alloy_for_cloudwatch` test helper signature (split across multiple lines) --- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 2 +- .../src/ptd/pulumi_resources/grafana_alloy.py | 8 ++++---- python-pulumi/tests/test_grafana_alloy.py | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 6d49ff6..fb4d04b 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -127,7 +127,7 @@ groups: execErrState: Error for: 5m annotations: - description: RDS instance has less than 5 GiB of free storage space remaining. + description: RDS instance has less than 5 GiB of free storage space remaining. Note: on new cluster deployments where CloudWatch scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:5m window; this is expected during provisioning. summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 6761405..2fa283e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -223,6 +223,7 @@ def _define_cloudwatch_config(self) -> str: if self.cloud_provider != "aws": return "" _validate_alloy_true_name(self.workload.cfg.true_name) + _validate_alloy_true_name(self.workload.compound_name) return textwrap.dedent(f""" prometheus.exporter.cloudwatch "cloudwatch" {{ sts_region = "{self.region}" @@ -262,13 +263,12 @@ def _define_cloudwatch_config(self) -> str: period = "5m" }} + # TODO: Remove ["Sum"] from statistics once all Grafana dashboards have + # been updated to query aws_rds_database_connections_average. # Collecting both Sum and Average during migration. Average is the # target metric (aws_rds_database_connections_average); Sum # (aws_rds_database_connections_sum) is kept temporarily for existing - # dashboards. Remove ["Sum"] from statistics once all Grafana dashboards - # have been updated to query aws_rds_database_connections_average. - # NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. - # Create a GitHub issue to track this removal before it is forgotten. + # dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. metric {{ name = "DatabaseConnections" statistics = ["Average", "Sum"] diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index d3c2cf5..c248b03 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -577,7 +577,11 @@ def test_multiple_sites_with_different_replica_counts(self): assert 'name = "site-two-connect-fqdn"' not in result -def _make_alloy_for_cloudwatch(cloud_provider_name: str, true_name: str = "myapp", compound_name: str = "myapp-production") -> AlloyConfig: +def _make_alloy_for_cloudwatch( + cloud_provider_name: str, + true_name: str = "myapp", + compound_name: str = "myapp-production", +) -> AlloyConfig: """Helper to create an AlloyConfig instance with mocked attributes for cloudwatch tests.""" alloy = AlloyConfig.__new__(AlloyConfig) mock_workload = Mock() @@ -614,3 +618,13 @@ def test_non_aws_returns_empty_string(self) -> None: alloy = _make_alloy_for_cloudwatch("azure") result = alloy._define_cloudwatch_config() # noqa: SLF001 assert result == "" + + def test_invalid_true_name_raises_value_error(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", true_name='bad"name') + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + alloy._define_cloudwatch_config() # noqa: SLF001 + + def test_invalid_compound_name_raises_value_error(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", compound_name="bad{name}") + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + alloy._define_cloudwatch_config() # noqa: SLF001 From be4d986db10ff6fe37bbdf40073cea5b35354aed Mon Sep 17 00:00:00 2001 From: ian-flores Date: Mon, 23 Feb 2026 13:08:30 -0800 Subject: [PATCH 25/27] Address review findings (job 127) All 163 tests pass and Go builds cleanly. Changes: - `tests/test_lib.py`: Add test for `format_lb_tags` with `/` in key, exercising the real production key format (`posit.team/true-name`) - `tests/test_grafana_alloy.py`: Strengthen `TestDefineCloudwatchConfig` tests with assertions that interpolated `true_name` and `compound_name` values appear in the correct `search_tags` positions; add two dedicated tests for these - `aws_workload_helm.py`: Add docstring comment to `_build_alb_tag_string` clarifying that `format_lb_tags` validates for annotation safety only, and that Alloy River injection safety is enforced separately by `_validate_alloy_true_name` in `grafana_alloy.py` - `grafana_alerts/loadbalancer.yaml`, `natgateway.yaml`, `rds.yaml`: Add comment documenting that `{{$labels.cluster}}` in alert annotations is injected by the `prometheus.relabel.default` block in Alloy config, and will be blank if relabeling is absent --- .../src/ptd/grafana_alerts/loadbalancer.yaml | 5 +++++ .../src/ptd/grafana_alerts/natgateway.yaml | 5 +++++ python-pulumi/src/ptd/grafana_alerts/rds.yaml | 5 +++++ .../src/ptd/pulumi_resources/aws_workload_helm.py | 9 ++++++++- python-pulumi/tests/test_grafana_alloy.py | 13 +++++++++++++ python-pulumi/tests/test_lib.py | 7 +++++++ 6 files changed, 43 insertions(+), 1 deletion(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml index 4e7f244..19e77ec 100644 --- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -13,6 +13,11 @@ # uid: alb_response_latency_high # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). apiVersion: 1 groups: - orgId: 1 diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 479ed1e..0baf178 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -9,6 +9,11 @@ # uid: nat_gateway_packets_dropped # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). apiVersion: 1 groups: - orgId: 1 diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index fb4d04b..6b94f53 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -15,6 +15,11 @@ # uid: rds_database_connections_high # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). apiVersion: 1 groups: - orgId: 1 diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 18a55a9..029bfeb 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -15,7 +15,14 @@ def _build_alb_tag_string(true_name: str, environment: str, compound_name: str) -> str: - """Build the ALB annotation tag string from workload config values.""" + """Build the ALB annotation tag string from workload config values. + + Uses format_lb_tags, which validates that values are safe for LB controller + annotation strings (no commas, equals, or whitespace). Note: format_lb_tags does + NOT validate for Alloy River config safety (e.g. it permits `{`, `}`, `"`). + Alloy River injection safety is enforced separately by _validate_alloy_true_name + in grafana_alloy.py before values are interpolated into the CloudWatch config. + """ return format_lb_tags( { "posit.team/true-name": true_name, diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index c248b03..df0cb20 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -603,16 +603,29 @@ def test_aws_contains_natgateway_discovery_block(self) -> None: alloy = _make_alloy_for_cloudwatch("aws") result = alloy._define_cloudwatch_config() # noqa: SLF001 assert 'AWS/NATGateway' in result + assert '"posit.team/true-name" = "myapp"' in result def test_aws_contains_applicationelb_discovery_block(self) -> None: alloy = _make_alloy_for_cloudwatch("aws") result = alloy._define_cloudwatch_config() # noqa: SLF001 assert 'AWS/ApplicationELB' in result + assert '"posit.team/true-name" = "myapp"' in result def test_aws_contains_networkelb_discovery_block(self) -> None: alloy = _make_alloy_for_cloudwatch("aws") result = alloy._define_cloudwatch_config() # noqa: SLF001 assert 'AWS/NetworkELB' in result + assert '"posit.team/true-name" = "myapp"' in result + + def test_aws_search_tags_use_true_name(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", true_name="customapp") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert '"posit.team/true-name" = "customapp"' in result + + def test_aws_fsx_rds_ec2_use_compound_name(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", compound_name="customapp-staging") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert 'Name = "customapp-staging"' in result def test_non_aws_returns_empty_string(self) -> None: alloy = _make_alloy_for_cloudwatch("azure") diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index 828ecff..4b1f8ed 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -120,3 +120,10 @@ def test_format_lb_tags_carriage_return_in_key() -> None: def test_format_lb_tags_carriage_return_in_value() -> None: with pytest.raises(ValueError, match="whitespace"): format_lb_tags({"key": "bad\rvalue"}) + + +def test_format_lb_tags_slash_in_key() -> None: + # Production keys like posit.team/true-name contain /; must be accepted + result = format_lb_tags({"posit.team/true-name": "myapp", "posit.team/environment": "production"}) + assert "posit.team/true-name=myapp" in result + assert "posit.team/environment=production" in result From b7a6b57bdb882c5c6d41b98b8e618f7da242a953 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Wed, 25 Feb 2026 11:59:21 -0800 Subject: [PATCH 26/27] Fix EventBridge IAM permissions chicken-and-egg problem Set SupportsManagedByCondition to false for all EventBridge actions. The managed-by tag condition prevents managing EventBridge rules that don't already have the posit.team/managed-by tag, blocking operations like DescribeRule and TagResource. Account-scoping is sufficient. --- lib/aws/iam.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/aws/iam.go b/lib/aws/iam.go index 9c303b5..9361faa 100644 --- a/lib/aws/iam.go +++ b/lib/aws/iam.go @@ -1123,13 +1123,18 @@ func SqsActions() []Action { // EventsActions returns EventBridge-related actions func EventsActions() []Action { + // SupportsManagedByCondition is false for all resource-level actions because + // EventBridge rules may not have the posit.team/managed-by tag (e.g., rules + // created by Karpenter). Requiring the tag creates a chicken-and-egg problem + // where you can't tag, describe, or manage rules that lack the tag. + // Account-scoping via ResourceConditionLimitingActions is sufficient. return []Action{ {"events:ListRules", false, false}, - {"events:ListTagsForResource", true, true}, - {"events:*Rule", true, true}, - {"events:*Targets", true, true}, - {"events:*tag*", true, true}, - {"events:PutRule", true, true}, + {"events:ListTagsForResource", true, false}, + {"events:*Rule", true, false}, + {"events:*Targets", true, false}, + {"events:*tag*", true, false}, + {"events:PutRule", true, false}, } } From fade18d4849f0b7bf710c43cf8ee4f458841e6dd Mon Sep 17 00:00:00 2001 From: ian-flores Date: Wed, 25 Feb 2026 12:04:08 -0800 Subject: [PATCH 27/27] Fix Python lint and formatting errors --- .../src/ptd/pulumi_resources/grafana_alloy.py | 3 ++- python-pulumi/src/ptd/pulumi_resources/lib.py | 8 ++++++-- python-pulumi/src/ptd/pulumi_resources/traefik.py | 10 ++-------- python-pulumi/tests/test_grafana_alloy.py | 6 +++--- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 2fa283e..84c51d6 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -36,10 +36,11 @@ def _validate_alloy_true_name(true_name: str) -> None: graph-construction time so failures are caught during `pulumi preview`. """ if not re.match(r"^[a-zA-Z0-9._-]+$", true_name): - raise ValueError( + msg = ( f"workload true_name contains characters unsafe for Alloy River config: " f"{true_name!r}. Must match [a-zA-Z0-9._-]+" ) + raise ValueError(msg) class AlloyConfig(pulumi.ComponentResource): diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index 16dd35f..f292ad1 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -1,3 +1,7 @@ +_AWS_TAG_KEY_MAX_LENGTH = 128 +_AWS_TAG_VALUE_MAX_LENGTH = 256 + + def format_lb_tags(tags: dict[str, str]) -> str: """Format tags as comma-separated key=value pairs for AWS LB Controller annotations. @@ -17,7 +21,7 @@ def format_lb_tags(tags: dict[str, str]) -> str: if key.startswith("aws:"): msg = f"LB tag key uses reserved 'aws:' prefix: {key!r}" raise ValueError(msg) - if len(key) > 128: + if len(key) > _AWS_TAG_KEY_MAX_LENGTH: msg = f"LB tag key exceeds AWS 128-character limit ({len(key)} chars): {key!r}" raise ValueError(msg) if "," in key or "=" in key: @@ -29,7 +33,7 @@ def format_lb_tags(tags: dict[str, str]) -> str: if not value: msg = f"LB tag value must not be None or empty: key={key}" raise ValueError(msg) - if len(value) > 256: + if len(value) > _AWS_TAG_VALUE_MAX_LENGTH: msg = f"LB tag value exceeds AWS 256-character limit ({len(value)} chars): key={key}" raise ValueError(msg) if "," in value or "=" in value: diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 1078641..8ea5d51 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -21,16 +21,10 @@ def _build_nlb_tag_string(tags: dict[str, str] | None, cluster_name: str) -> str true_name = tags.get("posit.team/true-name") environment = tags.get("posit.team/environment") if true_name is None: - msg = ( - f"Missing required tag: 'posit.team/true-name' for NLB tagging. " - f"Available tags: {list(tags.keys())}" - ) + msg = f"Missing required tag: 'posit.team/true-name' for NLB tagging. Available tags: {list(tags.keys())}" raise ValueError(msg) if environment is None: - msg = ( - f"Missing required tag: 'posit.team/environment' for NLB tagging. " - f"Available tags: {list(tags.keys())}" - ) + msg = f"Missing required tag: 'posit.team/environment' for NLB tagging. Available tags: {list(tags.keys())}" raise ValueError(msg) return format_lb_tags( { diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index df0cb20..2b28fc3 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -602,19 +602,19 @@ class TestDefineCloudwatchConfig: def test_aws_contains_natgateway_discovery_block(self) -> None: alloy = _make_alloy_for_cloudwatch("aws") result = alloy._define_cloudwatch_config() # noqa: SLF001 - assert 'AWS/NATGateway' in result + assert "AWS/NATGateway" in result assert '"posit.team/true-name" = "myapp"' in result def test_aws_contains_applicationelb_discovery_block(self) -> None: alloy = _make_alloy_for_cloudwatch("aws") result = alloy._define_cloudwatch_config() # noqa: SLF001 - assert 'AWS/ApplicationELB' in result + assert "AWS/ApplicationELB" in result assert '"posit.team/true-name" = "myapp"' in result def test_aws_contains_networkelb_discovery_block(self) -> None: alloy = _make_alloy_for_cloudwatch("aws") result = alloy._define_cloudwatch_config() # noqa: SLF001 - assert 'AWS/NetworkELB' in result + assert "AWS/NetworkELB" in result assert '"posit.team/true-name" = "myapp"' in result def test_aws_search_tags_use_true_name(self) -> None: