diff --git a/docs/KNOWN_ISSUES.md b/docs/KNOWN_ISSUES.md index 6158398..79f9417 100644 --- a/docs/KNOWN_ISSUES.md +++ b/docs/KNOWN_ISSUES.md @@ -70,6 +70,20 @@ With this setting disabled, the infrastructure will deploy successfully without +### Traefik v3 CRD Migration + +**Background:** +PTD upgraded the Traefik ingress controller from v2 (Helm chart `24.x`) to v3 (Helm chart `33.x`). Traefik v3 ships updated CRD schemas for `IngressRoute`, `Middleware`, `TLSOption`, and related resources. + +**Risk:** +If any workloads use Traefik v2-style CRD fields or API groups that were removed or renamed in v3 (e.g., the `traefik.containo.us` API group replaced by `traefik.io`, `ServersTransport` → `ServersTransports`, changed TLS store/options schemas, modified middleware CRD fields), those resources may silently fail to reconcile after the Helm upgrade. + +**Before rolling out to production clusters:** +1. Audit existing `IngressRoute`, `Middleware`, `TLSOption`, and `ServersTransport` resources against the [Traefik v3 migration guide](https://doc.traefik.io/traefik/migration/v2-to-v3/). +2. Pay particular attention to the `ServersTransport` → `ServersTransports` rename and to any middleware fields your workloads rely on. +3. Test the upgrade in a staging cluster before applying to production. 
+ + ## Workarounds and Best Practices ### Direct Pulumi Stack Access with `ptd workon` diff --git a/python-pulumi/src/ptd/aws_control_room.py b/python-pulumi/src/ptd/aws_control_room.py index a43225a..7129030 100644 --- a/python-pulumi/src/ptd/aws_control_room.py +++ b/python-pulumi/src/ptd/aws_control_room.py @@ -71,7 +71,7 @@ class AWSControlRoomConfig: tailscale_enabled: bool = True tigera_operator_version: str = "3.27.2" traefik_forward_auth_version: str = "0.0.14" - traefik_version: str = "24.0.0" + traefik_version: str = "33.2.1" ebs_csi_addon_version: str = "v1.41.0-eksbuild.1" diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py index 3c094f3..882f3de 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_control_room_cluster.py @@ -218,6 +218,7 @@ def extract_dvo_records(dvos) -> list[aws.route53.Record]: "", cert, deployment_replicas=self.control_room.cfg.traefik_deployment_replicas, + version=self.control_room.cfg.traefik_version, opts=pulumi.ResourceOptions( parent=self.eks, provider=self.eks.provider, diff --git a/python-pulumi/src/ptd/pulumi_resources/traefik.py b/python-pulumi/src/ptd/pulumi_resources/traefik.py index 8ea5d51..a1764f8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/traefik.py @@ -35,6 +35,91 @@ def _build_nlb_tag_string(tags: dict[str, str] | None, cluster_name: str) -> str ) +def _build_traefik_helm_values( + node_selector: str, + deployment_replicas: int, + cert_arn, + nlb_tags: str, +) -> dict: + """Build the Helm values dict for the Traefik chart (v3 syntax).""" + return { + "service": { + "type": "LoadBalancer", + "annotations": { + "service.beta.kubernetes.io/aws-load-balancer-type": "external", + "service.beta.kubernetes.io/aws-load-balancer-scheme": "internet-facing", + 
"service.beta.kubernetes.io/aws-load-balancer-ip-address-type": "ipv4", + "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type": "ip", + "service.beta.kubernetes.io/aws-load-balancer-ssl-cert": cert_arn, + "service.beta.kubernetes.io/aws-load-balancer-ssl-ports": "443", + "service.beta.kubernetes.io/aws-load-balancer-access-log-enabled": "false", + "service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy": "ELBSecurityPolicy-FS-1-2-2019-08", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-healthy-threshold": "3", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold": "3", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-timeout": "10", + "service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval": "10", + "service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags": nlb_tags, + }, + }, + "ports": { + "web": { + "redirections": { + "entryPoint": { + "to": "websecure", + "scheme": "https", + "permanent": True, + } + }, + }, + "websecure": { + "tls": { + "enabled": False, + } + }, + }, + "nodeSelector": ( + { + "node.kubernetes.io/instance-type": node_selector, + } + if node_selector + else None + ), + "providers": { + "kubernetesCRD": { + "enabled": True, + }, + "kubernetesIngress": { + "enabled": True, + }, + "publishedService": { + "enabled": True, + }, + }, + "additionalArguments": [ + "--metrics.prometheus=true", + ], + "deployment": { + "replicas": deployment_replicas, + }, + "logs": { + "general": {"level": "DEBUG"}, + "access": {"enabled": True}, + }, + "image": { + "registry": "ghcr.io/traefik", + }, + "ingressClass": { + "enabled": True, + "isDefaultClass": True, + }, + "ingressRoute": { + "dashboard": { + "enabled": True, + } + }, + } + + class Traefik(pulumi.ComponentResource): def __init__( self, @@ -44,6 +129,7 @@ def __init__( cert: aws.acm.Certificate | None = None, deployment_replicas: int = 3, *args, + version: str, **kwargs, ): """ @@ -63,6 +149,7 @@ def 
__init__( self.cluster = cluster self.node_selector: str = node_selector self.deployment_replicas = deployment_replicas + self.version = version self.traefik: k8s.helm.v3.Release | None = None self._deploy(cert) @@ -156,84 +243,18 @@ def _deploy(self, cert: aws.acm.Certificate | None): f"{self.cluster.name}-traefik", k8s.helm.v3.ReleaseArgs( chart="traefik", - version="24.0.0", + version=self.version, namespace=self.namespace, name="traefik", repository_opts=k8s.helm.v3.RepositoryOptsArgs( - repo="https://helm.traefik.io/traefik/", + repo="https://traefik.github.io/charts", + ), + values=_build_traefik_helm_values( + node_selector=self.node_selector, + deployment_replicas=self.deployment_replicas, + cert_arn=(cert.arn if cert else None), + nlb_tags=nlb_tags, ), - values={ - "service": { - # TODO: we could make this into an ingress if we want...? - "type": "LoadBalancer", - "annotations": { - "service.beta.kubernetes.io/aws-load-balancer-type": "external", - "service.beta.kubernetes.io/aws-load-balancer-scheme": "internet-facing", - "service.beta.kubernetes.io/aws-load-balancer-ip-address-type": "ipv4", - "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type": "ip", - "service.beta.kubernetes.io/aws-load-balancer-ssl-cert": (cert.arn if cert else None), - "service.beta.kubernetes.io/aws-load-balancer-ssl-ports": "443", - "service.beta.kubernetes.io/aws-load-balancer-access-log-enabled": "false", - "service.beta.kubernetes.io/aws-load-balancer-ssl-negotiation-policy": "ELBSecurityPolicy-FS-1-2-2019-08", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-healthy-threshold": "3", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold": "3", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-timeout": "10", - "service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval": "10", - "service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags": nlb_tags, - }, - }, - "ports": { - "web": { - "redirectTo": 
"websecure", - }, - "websecure": { - "tls": { - "enabled": False, - } - }, - }, - "nodeSelector": ( - { - # "meta" nodes - "node.kubernetes.io/instance-type": self.node_selector, - } - if self.node_selector - else None - ), - "providers": { - "kubernetesCRD": { - "enabled": True, - }, - "kubernetesIngress": { - "enabled": True, - }, - "publishedService": { - "enabled": True, - }, - }, - "additionalArguments": [ - "--metrics.prometheus=true", - ], - "deployment": { - "replicas": self.deployment_replicas, - }, - "logs": { - "general": {"level": "DEBUG"}, - "access": {"enabled": True}, - }, - "image": { - "registry": "ghcr.io/traefik", - }, - "ingressClass": { - "enabled": True, - "default": True, - }, - "ingressRoute": { - "dashboard": { - "enabled": True, - } - }, - }, ), opts=pulumi.ResourceOptions( provider=self.cluster.provider, diff --git a/python-pulumi/tests/test_traefik_nlb_tags.py b/python-pulumi/tests/test_traefik_nlb_tags.py index 9bad2ba..4b606a4 100644 --- a/python-pulumi/tests/test_traefik_nlb_tags.py +++ b/python-pulumi/tests/test_traefik_nlb_tags.py @@ -1,6 +1,6 @@ import pytest -from ptd.pulumi_resources.traefik import _build_nlb_tag_string +from ptd.pulumi_resources.traefik import _build_nlb_tag_string, _build_traefik_helm_values def test_build_nlb_tag_string_happy_path() -> None: @@ -70,6 +70,64 @@ def test_build_nlb_tag_string_invalid_environment_value() -> None: ) +def _make_traefik_values(**overrides): + """Helper to create traefik helm values with sensible defaults.""" + defaults = dict( + node_selector="", + deployment_replicas=3, + cert_arn=None, + nlb_tags="posit.team/true-name=myapp,posit.team/environment=prod,Name=cluster", + ) + defaults.update(overrides) + return _build_traefik_helm_values(**defaults) + + +def test_traefik_v3_redirect_uses_redirections_syntax() -> None: + values = _make_traefik_values() + web = values["ports"]["web"] + assert "redirectTo" not in web, "v2 redirect syntax must not be present" + assert "redirections" in web 
+ assert web["redirections"]["entryPoint"]["to"] == "websecure" + assert web["redirections"]["entryPoint"]["scheme"] == "https" + assert web["redirections"]["entryPoint"]["permanent"] is True + + +def test_traefik_v3_ingress_class_uses_is_default_class() -> None: + values = _make_traefik_values() + ingress_class = values["ingressClass"] + assert "default" not in ingress_class, "v2 'default' key must not be present" + assert ingress_class["isDefaultClass"] is True + assert ingress_class["enabled"] is True + + +def test_traefik_node_selector_empty_string_yields_none() -> None: + values = _make_traefik_values(node_selector="") + assert values["nodeSelector"] is None + + +def test_traefik_node_selector_set_yields_dict() -> None: + values = _make_traefik_values(node_selector="m5.xlarge") + assert values["nodeSelector"] == {"node.kubernetes.io/instance-type": "m5.xlarge"} + + +def test_traefik_deployment_replicas_propagated() -> None: + values = _make_traefik_values(deployment_replicas=5) + assert values["deployment"]["replicas"] == 5 + + +def test_traefik_cert_arn_none_sets_annotation_to_none() -> None: + values = _make_traefik_values(cert_arn=None) + assert values["service"]["annotations"]["service.beta.kubernetes.io/aws-load-balancer-ssl-cert"] is None + + +def test_traefik_cert_arn_set_propagates_to_annotation() -> None: + values = _make_traefik_values(cert_arn="arn:aws:acm:us-east-1:123:certificate/abc") + assert ( + values["service"]["annotations"]["service.beta.kubernetes.io/aws-load-balancer-ssl-cert"] + == "arn:aws:acm:us-east-1:123:certificate/abc" + ) + + def test_build_nlb_tag_string_extra_tags_are_dropped() -> None: """Extra tags in the input dict (e.g. aws:created-by, Cost-Center) are intentionally discarded; only true-name, environment, and Name should appear in the output."""