From b04acfca470214c31b53cba053a1698f44f673f7 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Thu, 26 Feb 2026 12:27:19 -0800 Subject: [PATCH 1/3] Upgrade AWS control room Traefik from v2 to v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update Traefik Helm chart from v24.0.0 to v33.2.1 for AWS control rooms. Changes: - Bump chart version to 33.2.1 - Update repository URL to https://traefik.github.io/charts - Migrate Helm values to v3 format: - ports.web.redirectTo → ports.web.redirections.entryPoint structure - ingressClass.default → ingressClass.isDefaultClass - Update default version in AWSControlRoomConfig AWS-specific configuration preserved: - NLB annotations and SSL certificate handling unchanged - TLS termination remains at NLB level (websecure.tls.enabled: False) - Service type, ports, and health check config unchanged Workload Traefik already uses v3 (version 37.1.2) and requires no changes. --- python-pulumi/src/ptd/aws_control_room.py | 2 +- python-pulumi/src/ptd/pulumi_resources/traefik.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/python-pulumi/src/ptd/aws_control_room.py b/python-pulumi/src/ptd/aws_control_room.py index a43225a..7129030 100644 --- a/python-pulumi/src/ptd/aws_control_room.py +++ b/python-pulumi/src/ptd/aws_control_room.py @@ -71,7 +71,7 @@ class AWSControlRoomConfig: tailscale_enabled: bool = True tigera_operator_version: str = "3.27.2" traefik_forward_auth_version: str = "0.0.14" - traefik_version: str = "24.0.0" + traefik_version: str = "33.2.1" ebs_csi_addon_version: str = "v1.41.0-eksbuild.1" diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 8ea5d51..51f54b5 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -156,11 +156,11 @@ def _deploy(self, cert: aws.acm.Certificate | None): 
f"{self.cluster.name}-traefik", k8s.helm.v3.ReleaseArgs( chart="traefik", - version="24.0.0", + version="33.2.1", namespace=self.namespace, name="traefik", repository_opts=k8s.helm.v3.RepositoryOptsArgs( - repo="https://helm.traefik.io/traefik/", + repo="https://traefik.github.io/charts", ), values={ "service": { @@ -184,7 +184,13 @@ def _deploy(self, cert: aws.acm.Certificate | None): }, "ports": { "web": { - "redirectTo": "websecure", + "redirections": { + "entryPoint": { + "to": "websecure", + "scheme": "https", + "permanent": True, + } + }, }, "websecure": { "tls": { @@ -226,7 +232,7 @@ def _deploy(self, cert: aws.acm.Certificate | None): }, "ingressClass": { "enabled": True, - "default": True, + "isDefaultClass": True, }, "ingressRoute": { "dashboard": { From 6ee85fbdf908705a4189824d129bc14ab81acd80 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Thu, 26 Feb 2026 12:27:19 -0800 Subject: [PATCH 2/3] Address review findings (job 218) Changes: - Add `version` parameter to `Traefik.__init__` (defaulting to `"33.2.1"` for backward compatibility) and store it as `self.version` - Use `self.version` instead of the hardcoded `"33.2.1"` in `_deploy` so the Helm chart version flows from config - Pass `version=self.control_room.cfg.traefik_version` at the `Traefik(...)` call site in `aws_control_room_cluster.py`, ensuring the single source of truth in `AWSControlRoomConfig.traefik_version` drives both places --- .../src/ptd/pulumi_resources/aws_control_room_cluster.py | 1 + python-pulumi/src/ptd/pulumi_resources/traefik.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py index 3c094f3..882f3de 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py @@ -218,6 +218,7 @@ def extract_dvo_records(dvos) -> 
list[aws.route53.Record]: "", cert, deployment_replicas=self.control_room.cfg.traefik_deployment_replicas, + version=self.control_room.cfg.traefik_version, opts=pulumi.ResourceOptions( parent=self.eks, provider=self.eks.provider, diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 51f54b5..6c9b880 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -43,6 +43,7 @@ def __init__( node_selector: str = "", cert: aws.acm.Certificate | None = None, deployment_replicas: int = 3, + version: str = "33.2.1", *args, **kwargs, ): @@ -63,6 +64,7 @@ def __init__( self.cluster = cluster self.node_selector: str = node_selector self.deployment_replicas = deployment_replicas + self.version = version self.traefik: k8s.helm.v3.Release | None = None self._deploy(cert) @@ -156,7 +158,7 @@ def _deploy(self, cert: aws.acm.Certificate | None): f"{self.cluster.name}-traefik", k8s.helm.v3.ReleaseArgs( chart="traefik", - version="33.2.1", + version=self.version, namespace=self.namespace, name="traefik", repository_opts=k8s.helm.v3.RepositoryOptsArgs( From 683ac35618cd47a414eef64e3074364b40164ff6 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Thu, 26 Feb 2026 12:27:19 -0800 Subject: [PATCH 3/3] Address review findings (job 222) --- Changes: - Remove default value from `Traefik.__init__` `version` parameter (now keyword-only after `*args`) to prevent silent divergence from `AWSControlRoomConfig.traefik_version` - Extract `_build_traefik_helm_values()` as a module-level pure function (matching existing `_build_nlb_tag_string` pattern) to make Helm values testable - Add 7 unit tests covering v3 redirect syntax (`redirections.entryPoint`), `isDefaultClass`, node selector, replica count, and cert ARN propagation - Add Traefik v3 CRD migration note to `docs/KNOWN_ISSUES.md` documenting the risk of existing v2-style CRD resources failing to reconcile after upgrade 
--- docs/KNOWN_ISSUES.md | 14 ++ .../src/ptd/pulumi_resources/traefik.py | 171 ++++++++++-------- python-pulumi/tests/test_traefik_nlb_tags.py | 60 +++++- 3 files changed, 165 insertions(+), 80 deletions(-) diff --git a/docs/KNOWN_ISSUES.md b/docs/KNOWN_ISSUES.md index 6158398..79f9417 100644 --- a/docs/KNOWN_ISSUES.md +++ b/docs/KNOWN_ISSUES.md @@ -70,6 +70,20 @@ With this setting disabled, the infrastructure will deploy successfully without +### Traefik v3 CRD Migration + +**Background:** +PTD upgraded the Traefik ingress controller from v2 (Helm chart `24.x`) to v3 (Helm chart `33.x`). Traefik v3 ships updated CRD schemas for `IngressRoute`, `Middleware`, `TLSOption`, and related resources. + +**Risk:** +If any workloads use Traefik v2-style CRD resources or fields that were removed or renamed in v3 (e.g., the removed `traefik.containo.us` API group replaced by `traefik.io`, the `IPWhiteList` middleware renamed to `IPAllowList`, changed TLS store/options schemas, modified middleware CRD fields), those resources may fail to reconcile after the Helm upgrade without producing obvious errors. + +**Before rolling out to production clusters:** +1. Audit existing `IngressRoute`, `Middleware`, `TLSOption`, and `ServersTransport` resources against the [Traefik v3 migration guide](https://doc.traefik.io/traefik/migration/v2-to-v3/). +2. Pay particular attention to the `traefik.containo.us` → `traefik.io` API group change, the `IPWhiteList` → `IPAllowList` middleware rename, and any middleware fields your workloads rely on. +3. Test the upgrade in a staging cluster before applying to production. 
+ + ## Workarounds and Best Practices ### Direct Pulumi Stack Access with `ptd workon` diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 6c9b880..a1764f8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -35,6 +35,91 @@ def _build_nlb_tag_string(tags: dict[str, str] | None, cluster_name: str) -> str ) +def _build_traefik_helm_values( + node_selector: str, + deployment_replicas: int, + cert_arn, + nlb_tags: str, +) -> dict: + """Build the Helm values dict for the Traefik chart (v3 syntax).""" + return { + "service": { + "type": "LoadBalancer", + "annotations": { + "service.beta.kubernetes.io/aws-load-balancer-type": "external", + "service.beta.kubernetes.io/aws-load-balancer-scheme": "internet-facing", + "service.beta.kubernetes.io/aws-load-balancer-ip-address-type": "ipv4", + "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type": "ip", + "service.beta.kubernetes.io/aws-load-balancer-ssl-cert": cert_arn, + "service.beta.kubernetes.io/aws-load-balancer-ssl-ports": "443", + "service.beta.kubernetes.io/aws-load-balancer-access-log-enabled": "false", + "service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy": "ELBSecurityPolicy-FS-1-2-2019-08", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-healthy-threshold": "3", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold": "3", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-timeout": "10", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval": "10", + "service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags": nlb_tags, + }, + }, + "ports": { + "web": { + "redirections": { + "entryPoint": { + "to": "websecure", + "scheme": "https", + "permanent": True, + } + }, + }, + "websecure": { + "tls": { + "enabled": False, + } + }, + }, + "nodeSelector": ( + { + 
"node.kubernetes.io/instance-type": node_selector, + } + if node_selector + else None + ), + "providers": { + "kubernetesCRD": { + "enabled": True, + }, + "kubernetesIngress": { + "enabled": True, + }, + "publishedService": { + "enabled": True, + }, + }, + "additionalArguments": [ + "--metrics.prometheus=true", + ], + "deployment": { + "replicas": deployment_replicas, + }, + "logs": { + "general": {"level": "DEBUG"}, + "access": {"enabled": True}, + }, + "image": { + "registry": "ghcr.io/traefik", + }, + "ingressClass": { + "enabled": True, + "isDefaultClass": True, + }, + "ingressRoute": { + "dashboard": { + "enabled": True, + } + }, + } + + class Traefik(pulumi.ComponentResource): def __init__( self, @@ -43,8 +128,8 @@ def __init__( node_selector: str = "", cert: aws.acm.Certificate | None = None, deployment_replicas: int = 3, - version: str = "33.2.1", *args, + version: str, **kwargs, ): """ @@ -164,84 +249,12 @@ def _deploy(self, cert: aws.acm.Certificate | None): repository_opts=k8s.helm.v3.RepositoryOptsArgs( repo="https://traefik.github.io/charts", ), - values={ - "service": { - # TODO: we could make this into an ingress if we want...? 
- "type": "LoadBalancer", - "annotations": { - "service.beta.kubernetes.io/aws-load-balancer-type": "external", - "service.beta.kubernetes.io/aws-load-balancer-scheme": "internet-facing", - "service.beta.kubernetes.io/aws-load-balancer-ip-address-type": "ipv4", - "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type": "ip", - "service.beta.kubernetes.io/aws-load-balancer-ssl-cert": (cert.arn if cert else None), - "service.beta.kubernetes.io/aws-load-balancer-ssl-ports": "443", - "service.beta.kubernetes.io/aws-load-balancer-access-log-enabled": "false", - "service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy": "ELBSecurityPolicy-FS-1-2-2019-08", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-healthy-threshold": "3", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold": "3", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-timeout": "10", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval": "10", - "service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags": nlb_tags, - }, - }, - "ports": { - "web": { - "redirections": { - "entryPoint": { - "to": "websecure", - "scheme": "https", - "permanent": True, - } - }, - }, - "websecure": { - "tls": { - "enabled": False, - } - }, - }, - "nodeSelector": ( - { - # "meta" nodes - "node.kubernetes.io/instance-type": self.node_selector, - } - if self.node_selector - else None - ), - "providers": { - "kubernetesCRD": { - "enabled": True, - }, - "kubernetesIngress": { - "enabled": True, - }, - "publishedService": { - "enabled": True, - }, - }, - "additionalArguments": [ - "--metrics.prometheus=true", - ], - "deployment": { - "replicas": self.deployment_replicas, - }, - "logs": { - "general": {"level": "DEBUG"}, - "access": {"enabled": True}, - }, - "image": { - "registry": "ghcr.io/traefik", - }, - "ingressClass": { - "enabled": True, - "isDefaultClass": True, - }, - "ingressRoute": { - "dashboard": { - "enabled": True, - } - }, 
- }, + values=_build_traefik_helm_values( + node_selector=self.node_selector, + deployment_replicas=self.deployment_replicas, + cert_arn=(cert.arn if cert else None), + nlb_tags=nlb_tags, + ), ), opts=pulumi.ResourceOptions( provider=self.cluster.provider, diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py index 9bad2ba..4b606a4 100644 --- a/python-pulumi/tests/test_traefik_nlb_tags.py +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -1,6 +1,6 @@ import pytest -from ptd.pulumi_resources.traefik import _build_nlb_tag_string +from ptd.pulumi_resources.traefik import _build_nlb_tag_string, _build_traefik_helm_values def test_build_nlb_tag_string_happy_path() -> None: @@ -70,6 +70,64 @@ def test_build_nlb_tag_string_invalid_environment_value() -> None: ) +def _make_traefik_values(**overrides): + """Helper to create traefik helm values with sensible defaults.""" + defaults = dict( + node_selector="", + deployment_replicas=3, + cert_arn=None, + nlb_tags="posit.team/true-name=myapp,posit.team/environment=prod,Name=cluster", + ) + defaults.update(overrides) + return _build_traefik_helm_values(**defaults) + + +def test_traefik_v3_redirect_uses_redirections_syntax() -> None: + values = _make_traefik_values() + web = values["ports"]["web"] + assert "redirectTo" not in web, "v2 redirect syntax must not be present" + assert "redirections" in web + assert web["redirections"]["entryPoint"]["to"] == "websecure" + assert web["redirections"]["entryPoint"]["scheme"] == "https" + assert web["redirections"]["entryPoint"]["permanent"] is True + + +def test_traefik_v3_ingress_class_uses_is_default_class() -> None: + values = _make_traefik_values() + ingress_class = values["ingressClass"] + assert "default" not in ingress_class, "v2 'default' key must not be present" + assert ingress_class["isDefaultClass"] is True + assert ingress_class["enabled"] is True + + +def test_traefik_node_selector_empty_string_yields_none() -> None: + 
values = _make_traefik_values(node_selector="") + assert values["nodeSelector"] is None + + +def test_traefik_node_selector_set_yields_dict() -> None: + values = _make_traefik_values(node_selector="m5.xlarge") + assert values["nodeSelector"] == {"node.kubernetes.io/instance-type": "m5.xlarge"} + + +def test_traefik_deployment_replicas_propagated() -> None: + values = _make_traefik_values(deployment_replicas=5) + assert values["deployment"]["replicas"] == 5 + + +def test_traefik_cert_arn_none_sets_annotation_to_none() -> None: + values = _make_traefik_values(cert_arn=None) + assert values["service"]["annotations"]["service.beta.kubernetes.io/aws-load-balancer-ssl-cert"] is None + + +def test_traefik_cert_arn_set_propagates_to_annotation() -> None: + values = _make_traefik_values(cert_arn="arn:aws:acm:us-east-1:123:certificate/abc") + assert ( + values["service"]["annotations"]["service.beta.kubernetes.io/aws-load-balancer-ssl-cert"] + == "arn:aws:acm:us-east-1:123:certificate/abc" + ) + + def test_build_nlb_tag_string_extra_tags_are_dropped() -> None: """Extra tags in the input dict (e.g. aws:created-by, Cost-Center) are intentionally discarded; only true-name, environment, and Name should appear in the output."""