diff --git a/lib/aws/iam.go b/lib/aws/iam.go index 9c303b5..9361faa 100644 --- a/lib/aws/iam.go +++ b/lib/aws/iam.go @@ -1123,13 +1123,18 @@ func SqsActions() []Action { // EventsActions returns EventBridge-related actions func EventsActions() []Action { + // SupportsManagedByCondition is false for all resource-level actions because + // EventBridge rules may not have the posit.team/managed-by tag (e.g., rules + // created by Karpenter). Requiring the tag creates a chicken-and-egg problem + // where you can't tag, describe, or manage rules that lack the tag. + // Account-scoping via ResourceConditionLimitingActions is sufficient. return []Action{ {"events:ListRules", false, false}, - {"events:ListTagsForResource", true, true}, - {"events:*Rule", true, true}, - {"events:*Targets", true, true}, - {"events:*tag*", true, true}, - {"events:PutRule", true, true}, + {"events:ListTagsForResource", true, false}, + {"events:*Rule", true, false}, + {"events:*Targets", true, false}, + {"events:*tag*", true, false}, + {"events:PutRule", true, false}, } } diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index d28856f..0e18559 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -46,7 +46,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -107,7 +107,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last @@ -164,7 +164,7 @@ groups: type: and query: params: - - C + - A reducer: params: [] type: last diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml new file mode 100644 index 0000000..19e77ec --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -0,0 +1,247 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. 
+# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: alb_target_5xx_errors_high +# - orgId: 1 +# uid: alb_unhealthy_targets +# - orgId: 1 +# uid: nlb_unhealthy_targets +# - orgId: 1 +# uid: alb_response_latency_high +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). +apiVersion: 1 +groups: + - orgId: 1 + name: LoadBalancer + folder: Posit Alerts + interval: 5m + rules: + - uid: alb_target_5xx_errors_high + title: ALB Target 5XX Errors High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_applicationelb_httpcode_target_5_xx_count_sum{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 10 # Absolute count, not an error rate. High-traffic ALBs may never breach this; low-traffic ALBs may page on transient spikes. Consider a rate-based alert if traffic volume varies significantly across environments. 
+ type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures. + summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: alb_unhealthy_targets + title: ALB Unhealthy Targets + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_applicationelb_un_healthy_host_count_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + description: Application Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments. 
+ summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: nlb_unhealthy_targets + title: NLB Unhealthy Targets + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_networkelb_un_healthy_host_count_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + description: Network Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments and node drains. 
+ summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: alb_response_latency_high + title: ALB Response Latency High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_applicationelb_target_response_time_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 2 # 2 seconds + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + description: Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation. + summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml new file mode 100644 index 0000000..0baf178 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -0,0 +1,133 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. 
+# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: nat_gateway_port_allocation_errors +# - orgId: 1 +# uid: nat_gateway_packets_dropped +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). +apiVersion: 1 +groups: + - orgId: 1 + name: NATGateway + folder: Posit Alerts + interval: 5m + rules: + - uid: nat_gateway_port_allocation_errors + title: NAT Gateway Port Allocation Errors + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_natgateway_error_port_allocation_sum{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Using NoData to avoid spurious pages during scrape outages (Alloy restart, credential rotation, brief partitions). Add a separate "CloudWatch scrape down" alert to cover the outage case independently. 
+ execErrState: Error + for: 5m + annotations: + description: NAT Gateway is experiencing port allocation errors, which means outbound network connectivity is failing. This is a critical issue that requires immediate attention. + summary: Port allocation errors on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: nat_gateway_packets_dropped + title: NAT Gateway Packets Dropped + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_natgateway_packets_drop_count_sum{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 100 # Calibrated as a conservative baseline; high-throughput gateways may see this normally. Adjust per environment. + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: NAT Gateway has dropped more than 100 packets for over 5 minutes, indicating potential network issues. + summary: High packet drop rate on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml new file mode 100644 index 0000000..6b94f53 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -0,0 +1,306 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. 
+# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: rds_cpu_utilization_high +# - orgId: 1 +# uid: rds_free_storage_low +# - orgId: 1 +# uid: rds_freeable_memory_low +# - orgId: 1 +# uid: rds_read_latency_high +# - orgId: 1 +# uid: rds_database_connections_high +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). +apiVersion: 1 +groups: + - orgId: 1 + name: RDS + folder: Posit Alerts + interval: 5m + rules: + - uid: rds_cpu_utilization_high + title: RDS CPU Utilization High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_cpuutilization_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 10m + annotations: + description: RDS instance CPU utilization is above 80% for more than 10 minutes. 
+ summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: rds_free_storage_low + title: RDS Free Storage Low + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_free_storage_space_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 5368709120 # 5 GiB in bytes; calibrated for mid-size instances (100–500 GiB). Adjust for larger (e.g. 1 TiB) or smaller instances. + type: lt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full disk + execErrState: Error + for: 5m + annotations: + description: RDS instance has less than 5 GiB of free storage space remaining. Note: on new cluster deployments where CloudWatch scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:5m window; this is expected during provisioning. 
+ summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: rds_freeable_memory_low + title: RDS Freeable Memory Low + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_freeable_memory_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 536870912 # 512 MiB in bytes; calibrated for db.r5.large (~16 GiB RAM). Adjust for other instance classes. + type: lt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: Alerting # Memory exhaustion is latent; alert even when scraping stops so we don't silently miss an OOM condition + execErrState: Error + for: 10m + annotations: + description: RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes. Note: this threshold is calibrated for db.r5.large (~16 GiB RAM); it will fire continuously for small instances (e.g. db.t3.micro, db.t3.small). Adjust per instance class. 
+ summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + instance_size_dependent: "true" # Silence this label for known-small instance classes + isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label + - uid: rds_read_latency_high + title: RDS Read Latency High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_read_latency_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0.1 # 100ms in seconds + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 10m + annotations: + description: RDS instance read latency is above 100ms for more than 10 minutes. 
+ summary: High read latency on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: rds_database_connections_high + title: RDS Database Connections High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_database_connections_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 500 # Calibrated for db.r5.large (max ~4000). For small instances (db.t3.small max ~36) this alert will never fire; adjust per instance class. + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 5m + annotations: + description: RDS instance has more than 500 active database connections for more than 5 minutes. Note: this threshold is calibrated for db.r5.large (max ~4000 connections); it will never fire for small instances (e.g. db.t3.small max ~36). Adjust per instance class. 
+ summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + instance_size_dependent: "true" # Silence this label for known-small instance classes + isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 463c703..a7023af 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1949,6 +1949,9 @@ def with_grafana( self._create_alert_configmap("pods", grafana_ns) self._create_alert_configmap("cloudwatch", grafana_ns) + self._create_alert_configmap("rds", grafana_ns) + self._create_alert_configmap("natgateway", grafana_ns) + self._create_alert_configmap("loadbalancer", grafana_ns) self._create_alert_configmap("healthchecks", grafana_ns) self._create_alert_configmap("nodes", grafana_ns) self._create_alert_configmap("applications", grafana_ns) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 1e454b2..029bfeb 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -9,10 +9,29 @@ import ptd.aws_workload import ptd.pulumi_resources.aws_eks_cluster from ptd.pulumi_resources.grafana_alloy import AlloyConfig +from ptd.pulumi_resources.lib import format_lb_tags ALLOY_NAMESPACE = "alloy" +def _build_alb_tag_string(true_name: str, environment: str, compound_name: str) -> str: + """Build the ALB annotation tag string from workload config values. + + Uses format_lb_tags, which validates that values are safe for LB controller + annotation strings (no commas, equals, or whitespace). 
Note: format_lb_tags does + NOT validate for Alloy River config safety (e.g. it permits `{`, `}`, `"`). + Alloy River injection safety is enforced separately by _validate_alloy_true_name + in grafana_alloy.py before values are interpolated into the CloudWatch config. + """ + return format_lb_tags( + { + "posit.team/true-name": true_name, + "posit.team/environment": environment, + "Name": compound_name, + } + ) + + class AWSWorkloadHelm(pulumi.ComponentResource): workload: ptd.aws_workload.AWSWorkload @@ -826,6 +845,8 @@ def _define_per_site_ingress_annotations( return annotations def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str]: + # cfg.true_name, cfg.environment, and compound_name are plain str values + # loaded from YAML config at startup (see ptd/workload.py), not Pulumi Outputs. annotations = { "alb.ingress.kubernetes.io/ssl-redirect": "443", "alb.ingress.kubernetes.io/listen-ports": json.dumps([{"HTTP": 80}, {"HTTPS": 443}]), @@ -836,6 +857,11 @@ def _define_ingress_alb_annotations(self, cert_arns: list[str]) -> dict[str, str "alb.ingress.kubernetes.io/healthcheck-path": "/ping", "alb.ingress.kubernetes.io/healthcheck-port": "32090", "alb.ingress.kubernetes.io/load-balancer-attributes": "routing.http.drop_invalid_header_fields.enabled=true,idle_timeout.timeout_seconds=300", + "alb.ingress.kubernetes.io/tags": _build_alb_tag_string( + self.workload.cfg.true_name, + self.workload.cfg.environment, + self.workload.compound_name, + ), } if self.workload.cfg.provisioned_vpc: diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 4a75bec..84c51d6 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -28,6 +28,21 @@ class PTDComponentForAlloy: T = typing.TypeVar("T") +def _validate_alloy_true_name(true_name: str) -> None: + """Validate that true_name is safe for interpolation into 
Alloy River config. + + Alloy River config uses double-quoted strings; characters like `"`, `{`, `}` would + break the generated config or allow injection. This validation is enforced at + graph-construction time so failures are caught during `pulumi preview`. + """ + if not re.match(r"^[a-zA-Z0-9._-]+$", true_name): + msg = ( + f"workload true_name contains characters unsafe for Alloy River config: " + f"{true_name!r}. Must match [a-zA-Z0-9._-]+" + ) + raise ValueError(msg) + + class AlloyConfig(pulumi.ComponentResource): namespace: str config_map: kubernetes.core.v1.ConfigMap @@ -204,119 +219,222 @@ def _define_blackbox_config() -> str: many_spaces = re.compile(r"\s+") return many_spaces.sub(" ", cfg).strip() - def _define_config_map( - self, - name: str, - namespace: str, - ): - control_room_url = f"https://mimir.{self.workload.cfg.control_room_domain}/api/v1/push" - workload_url = "http://mimir-gateway.mimir.svc.cluster.local/api/v1/push" - loki_url = "http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push" + def _define_cloudwatch_config(self) -> str: + """Generate CloudWatch exporter configuration for AWS. 
Returns empty string for non-AWS.""" + if self.cloud_provider != "aws": + return "" + _validate_alloy_true_name(self.workload.cfg.true_name) + _validate_alloy_true_name(self.workload.compound_name) + return textwrap.dedent(f""" + prometheus.exporter.cloudwatch "cloudwatch" {{ + sts_region = "{self.region}" - if isinstance(self.workload, ptd.azure_workload.AzureWorkload): - account_id = self.workload.cfg.tenant_id - cluster_name = self.workload.cluster_name(self.release) - else: - account_id = self.workload.cfg.account_id - cluster_name = self.workload.eks_cluster_name(self.release) + discovery {{ + type = "AWS/FSx" + regions = ["{self.region}"] - # Generate CloudWatch exporter configuration for AWS - cloudwatch_config = "" - if self.cloud_provider == "aws": - cloudwatch_config = textwrap.dedent(f""" - prometheus.exporter.cloudwatch "cloudwatch" {{ - sts_region = "{self.region}" - - discovery {{ - type = "AWS/FSx" - regions = ["{self.region}"] - - search_tags = {{ - Name = "{self.workload.compound_name}", - }} + search_tags = {{ + Name = "{self.workload.compound_name}", + }} - metric {{ - name = "StorageCapacity" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "StorageCapacity" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "UsedStorageCapacity" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "UsedStorageCapacity" + statistics = ["Average"] + period = "5m" }} + }} - discovery {{ - type = "AWS/RDS" - regions = ["{self.region}"] + discovery {{ + type = "AWS/RDS" + regions = ["{self.region}"] - search_tags = {{ - Name = "{self.workload.compound_name}", - }} + search_tags = {{ + Name = "{self.workload.compound_name}", + }} - metric {{ - name = "FreeStorageSpace" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "FreeStorageSpace" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "DatabaseConnections" - statistics = ["Sum"] - period = "5m" - }} + # TODO: Remove ["Sum"] 
from statistics once all Grafana dashboards have + # been updated to query aws_rds_database_connections_average. + # Collecting both Sum and Average during migration. Average is the + # target metric (aws_rds_database_connections_average); Sum + # (aws_rds_database_connections_sum) is kept temporarily for existing + # dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. + metric {{ + name = "DatabaseConnections" + statistics = ["Average", "Sum"] + period = "5m" + }} - metric {{ - name = "ReadLatency" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "ReadLatency" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "CPUUtilization" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "CPUUtilization" + statistics = ["Average"] + period = "5m" + }} - metric {{ - name = "FreeableMemory" - statistics = ["Average"] - period = "5m" - }} + metric {{ + name = "FreeableMemory" + statistics = ["Average"] + period = "5m" }} - discovery {{ - type = "AWS/EC2" - regions = ["{self.region}"] + # Collected for dashboard visibility; no alert rules defined + metric {{ + name = "WriteLatency" + statistics = ["Average"] + period = "5m" + }} - search_tags = {{ - Name = "{self.workload.compound_name}", - }} + # Collected for dashboard visibility; no alert rules defined + metric {{ + name = "Deadlocks" + statistics = ["Sum"] + period = "5m" + }} + }} - metric {{ - name = "NetworkOut" - statistics = ["Average"] - period = "5m" - }} + discovery {{ + type = "AWS/EC2" + regions = ["{self.region}"] - metric {{ - name = "NetworkPacketsOut" - statistics = ["Average"] - period = "5m" - }} + search_tags = {{ + Name = "{self.workload.compound_name}", + }} + + metric {{ + name = "NetworkOut" + statistics = ["Average"] + period = "5m" + }} + + metric {{ + name = "NetworkPacketsOut" + statistics = ["Average"] + period = "5m" }} }} - prometheus.scrape "cloudwatch" {{ - targets = 
prometheus.exporter.cloudwatch.cloudwatch.targets - forward_to = [prometheus.relabel.default.receiver] - clustering {{ - enabled = true + discovery {{ + type = "AWS/NATGateway" + regions = ["{self.region}"] + + # NAT Gateways inherit VPC tags including posit.team/true-name + # (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} + + metric {{ + name = "ErrorPortAllocation" + statistics = ["Sum"] + period = "5m" + }} + + metric {{ + name = "PacketsDropCount" + statistics = ["Sum"] + period = "5m" }} }} - """) + + discovery {{ + type = "AWS/ApplicationELB" + regions = ["{self.region}"] + + # ALBs are tagged at creation time via aws_workload_helm.py. + # LBs provisioned before this tag was added won't be discovered + # until the cluster is redeployed. + # FIXME: To tag existing ALBs without redeploying, use the AWS CLI: + # aws elbv2 add-tags --resource-arns <alb-arn> \ + # --tags Key=posit.team/true-name,Value=<true-name> + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} + + metric {{ + name = "HTTPCode_Target_5XX_Count" + statistics = ["Sum"] + period = "5m" + }} + + metric {{ + name = "UnHealthyHostCount" + statistics = ["Average"] + period = "5m" + }} + + metric {{ + name = "TargetResponseTime" + statistics = ["Average"] + period = "5m" + }} + }} + + discovery {{ + type = "AWS/NetworkELB" + regions = ["{self.region}"] + + # NLBs are tagged at creation time via traefik.py. + # LBs provisioned before this tag was added won't be discovered + # until the cluster is redeployed. 
+ # FIXME: To tag existing NLBs without redeploying, use the AWS CLI: + # aws elbv2 add-tags --resource-arns <nlb-arn> \ + # --tags Key=posit.team/true-name,Value=<true-name> + search_tags = {{ + "posit.team/true-name" = "{self.workload.cfg.true_name}", + }} + + metric {{ + name = "UnHealthyHostCount" + statistics = ["Average"] + period = "5m" + }} + }} + }} + + prometheus.scrape "cloudwatch" {{ + targets = prometheus.exporter.cloudwatch.cloudwatch.targets + forward_to = [prometheus.relabel.default.receiver] + clustering {{ + enabled = true + }} + }} + """) + + def _define_config_map( + self, + name: str, + namespace: str, + ): + control_room_url = f"https://mimir.{self.workload.cfg.control_room_domain}/api/v1/push" + workload_url = "http://mimir-gateway.mimir.svc.cluster.local/api/v1/push" + loki_url = "http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push" + + if isinstance(self.workload, ptd.azure_workload.AzureWorkload): + account_id = self.workload.cfg.tenant_id + cluster_name = self.workload.cluster_name(self.release) + else: + account_id = self.workload.cfg.account_id + cluster_name = self.workload.eks_cluster_name(self.release) + + # Generate CloudWatch exporter configuration for AWS + cloudwatch_config = self._define_cloudwatch_config() # Generate system log scraping configuration system_logs_config = "" diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py new file mode 100644 index 0000000..f292ad1 --- /dev/null +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -0,0 +1,45 @@ +_AWS_TAG_KEY_MAX_LENGTH = 128 +_AWS_TAG_VALUE_MAX_LENGTH = 256 + + +def format_lb_tags(tags: dict[str, str]) -> str: + """Format tags as comma-separated key=value pairs for AWS LB Controller annotations. + + Validates that tag keys and values do not contain commas or equals signs, + which would break the annotation format. 
Whitespace (spaces, tabs, newlines) + is also rejected in both keys and values; while AWS tag values permit spaces, + this function is used exclusively for LB controller annotation strings where + whitespace would be ambiguous. This is a deliberate constraint, not an AWS limit. + """ + if not tags: + msg = "tags must not be empty" + raise ValueError(msg) + for key, value in tags.items(): + if not key: + msg = "LB tag key must not be empty" + raise ValueError(msg) + if key.startswith("aws:"): + msg = f"LB tag key uses reserved 'aws:' prefix: {key!r}" + raise ValueError(msg) + if len(key) > _AWS_TAG_KEY_MAX_LENGTH: + msg = f"LB tag key exceeds AWS 128-character limit ({len(key)} chars): {key!r}" + raise ValueError(msg) + if "," in key or "=" in key: + msg = f"LB tag key contains invalid characters (comma or equals): {key}" + raise ValueError(msg) + if any(c in key for c in (" ", "\t", "\n", "\r")): + msg = f"LB tag key contains invalid whitespace character: {key!r}" + raise ValueError(msg) + if not value: + msg = f"LB tag value must not be None or empty: key={key}" + raise ValueError(msg) + if len(value) > _AWS_TAG_VALUE_MAX_LENGTH: + msg = f"LB tag value exceeds AWS 256-character limit ({len(value)} chars): key={key}" + raise ValueError(msg) + if "," in value or "=" in value: + msg = f"LB tag value contains invalid characters (comma or equals): {key}={value}" + raise ValueError(msg) + if any(c in value for c in (" ", "\t", "\n", "\r")): + msg = f"LB tag value contains invalid whitespace character: {key}={value!r}" + raise ValueError(msg) + return ",".join(f"{k}={v}" for k, v in tags.items()) diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 874d2cf..8ea5d51 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -7,6 +7,32 @@ import ptd.pulumi_resources.aws_eks_cluster import ptd.pulumi_resources.aws_vpc +from 
ptd.pulumi_resources.lib import format_lb_tags + + +def _build_nlb_tag_string(tags: dict[str, str] | None, cluster_name: str) -> str: + """Build the NLB annotation tag string from cluster tags.""" + if tags is None: + msg = ( + "Cluster tags must not be None; expected a dict with " + "'posit.team/true-name' and 'posit.team/environment' for NLB tagging." + ) + raise ValueError(msg) + true_name = tags.get("posit.team/true-name") + environment = tags.get("posit.team/environment") + if true_name is None: + msg = f"Missing required tag: 'posit.team/true-name' for NLB tagging. Available tags: {list(tags.keys())}" + raise ValueError(msg) + if environment is None: + msg = f"Missing required tag: 'posit.team/environment' for NLB tagging. Available tags: {list(tags.keys())}" + raise ValueError(msg) + return format_lb_tags( + { + "posit.team/true-name": true_name, + "posit.team/environment": environment, + "Name": cluster_name, + } + ) class Traefik(pulumi.ComponentResource): @@ -120,6 +146,12 @@ def _deploy(self, cert: aws.acm.Certificate | None): :return: """ + # Build tag string from cluster tags for NLB annotation. + # cluster.name, true_name, and environment are plain str values (logical resource + # names / config loaded at startup), not Pulumi Outputs, so format_lb_tags() checks + # work correctly at graph-construction time. 
+ nlb_tags = _build_nlb_tag_string(self.cluster.tags, self.cluster.name) + self.traefik = k8s.helm.v3.Release( f"{self.cluster.name}-traefik", k8s.helm.v3.ReleaseArgs( @@ -147,6 +179,7 @@ def _deploy(self, cert: aws.acm.Certificate | None): "service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold": "3", "service.beta.kubernetes.io/aws-load-balancer-healthcheck-timeout": "10", "service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval": "10", + "service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags": nlb_tags, }, }, "ports": { diff --git a/python-pulumi/tests/test_alb_tags.py b/python-pulumi/tests/test_alb_tags.py new file mode 100644 index 0000000..5704bdb --- /dev/null +++ b/python-pulumi/tests/test_alb_tags.py @@ -0,0 +1,64 @@ +import pytest + +from ptd.pulumi_resources.aws_workload_helm import _build_alb_tag_string + + +def test_build_alb_tag_string_happy_path() -> None: + result = _build_alb_tag_string( + true_name="myapp", + environment="production", + compound_name="myapp-production", + ) + parsed = dict(pair.split("=", 1) for pair in result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-production", + } + + +def test_build_alb_tag_string_tags_key_present() -> None: + result = _build_alb_tag_string( + true_name="myapp", + environment="staging", + compound_name="myapp-staging", + ) + assert "posit.team/true-name=myapp" in result + assert "posit.team/environment=staging" in result + assert "Name=myapp-staging" in result + + +def test_build_alb_tag_string_invalid_compound_name() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_alb_tag_string( + true_name="myapp", + environment="production", + compound_name="bad,name", + ) + + +def test_build_alb_tag_string_empty_compound_name() -> None: + with pytest.raises(ValueError, match="must not be None or empty"): + _build_alb_tag_string( + true_name="myapp", + 
environment="production", + compound_name="", + ) + + +def test_build_alb_tag_string_invalid_true_name_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_alb_tag_string( + true_name="bad,name", + environment="production", + compound_name="myapp-production", + ) + + +def test_build_alb_tag_string_invalid_environment_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_alb_tag_string( + true_name="myapp", + environment="bad=env", + compound_name="myapp-production", + ) diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index bcc0f33..2b28fc3 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -2,9 +2,40 @@ from pathlib import Path from unittest.mock import Mock +import pytest import yaml -from ptd.pulumi_resources.grafana_alloy import AlloyConfig +from ptd.pulumi_resources.grafana_alloy import AlloyConfig, _validate_alloy_true_name + + +class TestValidateAlloyTrueName: + def test_valid_names(self) -> None: + _validate_alloy_true_name("myapp") + _validate_alloy_true_name("my-app") + _validate_alloy_true_name("my.app.v2") + _validate_alloy_true_name("app_name") + _validate_alloy_true_name("myapp-production") + _validate_alloy_true_name("a1b2c3") + + def test_double_quote_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name('bad"name') + + def test_open_brace_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("bad{name}") + + def test_close_brace_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("bad}name") + + def test_space_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("bad name") + + def 
test_empty_string_rejected(self) -> None: + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + _validate_alloy_true_name("") @dataclasses.dataclass @@ -544,3 +575,69 @@ def test_multiple_sites_with_different_replica_counts(self): assert 'name = "site-two-workbench-fqdn"' in result assert 'name = "site-two-connect"' not in result assert 'name = "site-two-connect-fqdn"' not in result + + +def _make_alloy_for_cloudwatch( + cloud_provider_name: str, + true_name: str = "myapp", + compound_name: str = "myapp-production", +) -> AlloyConfig: + """Helper to create an AlloyConfig instance with mocked attributes for cloudwatch tests.""" + alloy = AlloyConfig.__new__(AlloyConfig) + mock_workload = Mock() + mock_workload.cfg.true_name = true_name + mock_workload.compound_name = compound_name + mock_cloud_provider = Mock() + mock_cloud_provider.name = cloud_provider_name + mock_workload.cloud_provider = mock_cloud_provider + alloy.workload = mock_workload + alloy.cloud_provider = cloud_provider_name.lower() + alloy.region = "us-east-1" + return alloy + + +class TestDefineCloudwatchConfig: + """Tests for _define_cloudwatch_config method.""" + + def test_aws_contains_natgateway_discovery_block(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert "AWS/NATGateway" in result + assert '"posit.team/true-name" = "myapp"' in result + + def test_aws_contains_applicationelb_discovery_block(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert "AWS/ApplicationELB" in result + assert '"posit.team/true-name" = "myapp"' in result + + def test_aws_contains_networkelb_discovery_block(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert "AWS/NetworkELB" in result + assert '"posit.team/true-name" = "myapp"' in result + + def 
test_aws_search_tags_use_true_name(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", true_name="customapp") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert '"posit.team/true-name" = "customapp"' in result + + def test_aws_fsx_rds_ec2_use_compound_name(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", compound_name="customapp-staging") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert 'Name = "customapp-staging"' in result + + def test_non_aws_returns_empty_string(self) -> None: + alloy = _make_alloy_for_cloudwatch("azure") + result = alloy._define_cloudwatch_config() # noqa: SLF001 + assert result == "" + + def test_invalid_true_name_raises_value_error(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", true_name='bad"name') + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + alloy._define_cloudwatch_config() # noqa: SLF001 + + def test_invalid_compound_name_raises_value_error(self) -> None: + alloy = _make_alloy_for_cloudwatch("aws", compound_name="bad{name}") + with pytest.raises(ValueError, match="unsafe for Alloy River config"): + alloy._define_cloudwatch_config() # noqa: SLF001 diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py new file mode 100644 index 0000000..4b1f8ed --- /dev/null +++ b/python-pulumi/tests/test_lib.py @@ -0,0 +1,129 @@ +import pytest + +from ptd.pulumi_resources.lib import format_lb_tags + + +def test_format_lb_tags_normal() -> None: + tags = { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-production", + } + result = format_lb_tags(tags) + parsed = dict(pair.split("=", 1) for pair in result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-production", + } + + +def test_format_lb_tags_single_entry() -> None: + assert format_lb_tags({"key": "value"}) == "key=value" + + +def 
test_format_lb_tags_comma_in_key() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"bad,key": "value"}) + + +def test_format_lb_tags_equals_in_key() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"bad=key": "value"}) + + +def test_format_lb_tags_comma_in_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"key": "bad,value"}) + + +def test_format_lb_tags_equals_in_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + format_lb_tags({"key": "bad=value"}) + + +def test_format_lb_tags_empty_key() -> None: + with pytest.raises(ValueError, match="must not be empty"): + format_lb_tags({"": "value"}) + + +def test_format_lb_tags_empty_value() -> None: + with pytest.raises(ValueError, match="must not be None or empty"): + format_lb_tags({"key": ""}) + + +def test_format_lb_tags_empty_dict() -> None: + with pytest.raises(ValueError, match="must not be empty"): + format_lb_tags({}) + + +def test_format_lb_tags_space_in_key() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"bad key": "value"}) + + +def test_format_lb_tags_space_in_value() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"key": "bad value"}) + + +def test_format_lb_tags_tab_in_key() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"bad\tkey": "value"}) + + +def test_format_lb_tags_newline_in_key() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"bad\nkey": "value"}) + + +def test_format_lb_tags_tab_in_value() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"key": "bad\tvalue"}) + + +def test_format_lb_tags_newline_in_value() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"key": "bad\nvalue"}) + + +def test_format_lb_tags_key_at_limit() -> None: + # 128 chars — should succeed 
(boundary check for > comparison) + format_lb_tags({"k" * 128: "value"}) + + +def test_format_lb_tags_key_too_long() -> None: + with pytest.raises(ValueError, match="128-character limit"): + format_lb_tags({"k" * 129: "value"}) + + +def test_format_lb_tags_value_at_limit() -> None: + # 256 chars — should succeed (boundary check for > comparison) + format_lb_tags({"key": "v" * 256}) + + +def test_format_lb_tags_value_too_long() -> None: + with pytest.raises(ValueError, match="256-character limit"): + format_lb_tags({"key": "v" * 257}) + + +def test_format_lb_tags_aws_reserved_prefix() -> None: + with pytest.raises(ValueError, match="reserved 'aws:' prefix"): + format_lb_tags({"aws:foo": "bar"}) + + +def test_format_lb_tags_carriage_return_in_key() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"bad\rkey": "value"}) + + +def test_format_lb_tags_carriage_return_in_value() -> None: + with pytest.raises(ValueError, match="whitespace"): + format_lb_tags({"key": "bad\rvalue"}) + + +def test_format_lb_tags_slash_in_key() -> None: + # Production keys like posit.team/true-name contain /; must be accepted + result = format_lb_tags({"posit.team/true-name": "myapp", "posit.team/environment": "production"}) + assert "posit.team/true-name=myapp" in result + assert "posit.team/environment=production" in result diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py new file mode 100644 index 0000000..9bad2ba --- /dev/null +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -0,0 +1,90 @@ +import pytest + +from ptd.pulumi_resources.traefik import _build_nlb_tag_string + + +def test_build_nlb_tag_string_happy_path() -> None: + result = _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, + cluster_name="myapp-cluster", + ) + # Parse into key=value pairs to avoid coupling the test to dict insertion order + parsed = dict(pair.split("=", 1) for pair in 
result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-cluster", + } + + +def test_build_nlb_tag_string_tags_none() -> None: + with pytest.raises(ValueError, match="must not be None"): + _build_nlb_tag_string(tags=None, cluster_name="myapp-cluster") + + +def test_build_nlb_tag_string_missing_true_name() -> None: + with pytest.raises(ValueError, match="posit.team/true-name"): + _build_nlb_tag_string( + tags={"posit.team/environment": "production"}, + cluster_name="myapp-cluster", + ) + + +def test_build_nlb_tag_string_missing_environment() -> None: + with pytest.raises(ValueError, match="posit.team/environment"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp"}, + cluster_name="myapp-cluster", + ) + + +def test_build_nlb_tag_string_invalid_cluster_name() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, + cluster_name="bad,name", + ) + + +def test_build_nlb_tag_string_empty_cluster_name() -> None: + with pytest.raises(ValueError, match="must not be None or empty"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "production"}, + cluster_name="", + ) + + +def test_build_nlb_tag_string_invalid_true_name_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "bad,name", "posit.team/environment": "prod"}, + cluster_name="cluster", + ) + + +def test_build_nlb_tag_string_invalid_environment_value() -> None: + with pytest.raises(ValueError, match="comma or equals"): + _build_nlb_tag_string( + tags={"posit.team/true-name": "myapp", "posit.team/environment": "bad=env"}, + cluster_name="cluster", + ) + + +def test_build_nlb_tag_string_extra_tags_are_dropped() -> None: + """Extra tags in the input dict (e.g. 
aws:created-by, Cost-Center) are intentionally + discarded; only true-name, environment, and Name should appear in the output.""" + result = _build_nlb_tag_string( + tags={ + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "aws:created-by": "someone", + "Cost-Center": "123", + }, + cluster_name="myapp-cluster", + ) + parsed = dict(pair.split("=", 1) for pair in result.split(",")) + assert parsed == { + "posit.team/true-name": "myapp", + "posit.team/environment": "production", + "Name": "myapp-cluster", + }