diff --git a/docs/KNOWN_ISSUES.md b/docs/KNOWN_ISSUES.md
index 6158398..d98c324 100644
--- a/docs/KNOWN_ISSUES.md
+++ b/docs/KNOWN_ISSUES.md
@@ -117,4 +117,23 @@ exit
 - Changes made directly via Pulumi CLI may be overwritten by subsequent `ptd ensure` runs if they conflict with your configuration
 - This is an advanced troubleshooting tool - use it when the standard PTD commands aren't sufficient
+
+### External Secrets Operator: ClusterSecretStore Fails on First Run
+
+**The Problem:**
+When enabling `enable_external_secrets_operator` on a fresh cluster, the `ClusterSecretStore` resource
+may fail to apply with `no matches for kind "ClusterSecretStore"`. This happens because Pulumi registers
+the ESO HelmChart CR, but the CRDs the chart installs have not yet converged by the time Pulumi
+attempts to create the `ClusterSecretStore`.
+
+**Why It Happens:**
+Declaring `depends_on` on the HelmChart CR only ensures the CR is accepted by the API server, not
+that the ESO controller has finished installing its CRDs. On a fresh cluster, CRD propagation can
+take several minutes. Pulumi will retry for up to 10 minutes via `CustomTimeouts(create="10m")`,
+but may still time out on very slow clusters or under resource pressure.
+
+**The Solution:**
+Re-run `ptd ensure` after the initial failure. By that point the CRDs will be available and the
+`ClusterSecretStore` will apply successfully.
+
 ---
diff --git a/docs/team-operator/kind-site-example.yaml b/docs/team-operator/kind-site-example.yaml
new file mode 100644
index 0000000..a35c9e2
--- /dev/null
+++ b/docs/team-operator/kind-site-example.yaml
@@ -0,0 +1,72 @@
+# Example Site CR for kind local development
+# Demonstrates cloud-agnostic configuration using standard Kubernetes resources
+#
+# Prerequisites:
+# - kind cluster with standard StorageClass (default local-path-provisioner)
+# - K8s Secrets created manually (dev-secrets, workload-secrets)
+# - PostgreSQL database accessible from the cluster
+#
+# Usage:
+#   kubectl apply -f kind-site-example.yaml
+
+apiVersion: core.posit.team/v1beta1
+kind: Site
+metadata:
+  name: dev
+  namespace: posit-team
+  labels:
+    app.kubernetes.io/instance: dev
+spec:
+  # Cloud-agnostic storage: uses kind's default StorageClass
+  storageClassName: standard
+
+  # Cloud-agnostic secrets: reference K8s Secrets by name
+  # Create these manually for kind:
+  #   kubectl create secret generic dev-secrets -n posit-team \
+  #     --from-literal=dev.lic="..." \
+  #     --from-literal=connect-apikey="..." \
+  #     --from-literal=admin_token="..."
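+  #
+  # The workload-level Secret follows the same pattern (the key name below is
+  # illustrative only — substitute the keys your workload actually requires):
+  #   kubectl create secret generic workload-secrets -n posit-team \
+  #     --from-literal=some-workload-key="..."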
+ secret: + name: dev-secrets + + workloadSecret: + name: workload-secrets + + # Database credentials (still needs type/vaultName format for now) + mainDatabaseCredentialSecret: + type: kubernetes + name: postgres-credentials + + # Domain for accessing services + domain: dev.localhost + + # Network trust level + networkTrust: anyone + + # Product-specific configuration + connect: + # Cloud-agnostic IAM: explicit ServiceAccount name + # For kind, no annotations needed (no cloud IAM integration) + serviceAccountName: dev-connect + # Storage buckets (not needed for local dev) + + workbench: + serviceAccountName: dev-workbench + # sessionTolerations: [] # optional, for node taints + + packageManager: + serviceAccountName: dev-packagemanager + # For kind, Package Manager can use the same storage as other products + # No special Azure Files configuration needed + + chronicle: + serviceAccountName: dev-chronicle + + flightdeck: + serviceAccountName: dev-home + + # No gatewayRef needed for basic kind testing + # kind can use traditional Ingress resources instead of Gateway API + + # No nfsEgressCIDR needed for local development + # Network policies can be disabled or simplified for kind diff --git a/python-pulumi/src/ptd/__init__.py b/python-pulumi/src/ptd/__init__.py index 2e2c01f..0231746 100644 --- a/python-pulumi/src/ptd/__init__.py +++ b/python-pulumi/src/ptd/__init__.py @@ -428,6 +428,12 @@ class WorkloadClusterConfig: # After migration, set to False to let Helm manage CRDs going forward. team_operator_skip_crds: bool = False + def __post_init__(self) -> None: + # No-op implementation makes super().__post_init__() safe to call from subclasses + # (e.g. AWSWorkloadClusterConfig) without requiring every intermediate class to guard + # against AttributeError when the MRO reaches this base. + pass + def load_workload_cluster_site_dict( cluster_site_dict: dict[str, typing.Any], diff --git a/python-pulumi/src/ptd/aws_workload.py b/python-pulumi/src/ptd/aws_workload.py index 81724aa..a1552da 100644 --- a/python-pulumi/src/ptd/aws_workload.py +++ b/python-pulumi/src/ptd/aws_workload.py @@ -255,9 +255,29 @@ class AWSWorkloadClusterConfig(ptd.WorkloadClusterConfig): additional_node_groups: dict[str, ptd.NodeGroupConfig] = dataclasses.field(default_factory=dict) public_endpoint_access: bool = True ebs_csi_addon_version: str = "v1.41.0-eksbuild.1" + pod_identity_agent_version: str | None = None + enable_pod_identity_agent: bool = False + enable_external_secrets_operator: bool = False + # Requires the workload secret (secret_name) to contain 'fs-dns-name' (FSx NFS endpoint) before + # `pulumi up` is run; a missing key causes a deploy-time error (dry runs warn instead). + # Security note: the storageClass pathPattern derives subdirectory paths from the + # nfs.io/storage-path PVC annotation, which is user-controlled. Any entity with PVC create + # permissions can supply arbitrary paths; restrict via OPA/Gatekeeper or a + # ValidatingWebhookConfiguration if cross-path access is a concern. 
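+    # Illustrative PVC annotation (the value "sites/dev" is hypothetical); the
+    # provisioner maps the claim to <nfs path>/<annotation value> via pathPattern:
+    #   metadata:
+    #     annotations:
+    #       nfs.io/storage-path: sites/dev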
+    enable_nfs_subdir_provisioner: bool = False  # PVCs must carry the nfs.io/storage-path annotation; the storageClass pathPattern uses it to derive subdirectory paths
     enable_efs_csi_driver: bool = False
     efs_config: ptd.EFSConfig | None = None
     karpenter_config: KarpenterConfig | None = None
+    enable_gateway_api: bool = False  # Enables Gateway API CRDs, Traefik Gateway provider, and gatewayRef in Site CRs
+
+    def __post_init__(self) -> None:
+        super().__post_init__()
+        if self.enable_external_secrets_operator and not self.enable_pod_identity_agent:
+            msg = (
+                "enable_external_secrets_operator requires enable_pod_identity_agent=True "
+                "(ClusterSecretStore uses no auth block and relies on Pod Identity for credentials)."
+            )
+            raise ValueError(msg)
 
 
 @dataclasses.dataclass(frozen=True)
@@ -268,6 +288,8 @@ class AWSWorkloadClusterComponentConfig(ptd.WorkloadClusterComponentConfig):
     secret_store_csi_driver_aws_provider_version: str | None = "0.3.5"  # noqa: S105
     nvidia_device_plugin_version: str | None = "0.17.1"
     karpenter_version: str | None = "1.6.0"
+    nfs_subdir_provisioner_version: str | None = "4.0.18"
+    external_secrets_operator_version: str | None = "0.10.7"
 
 
 class AWSWorkload(ptd.workload.AbstractWorkload):
@@ -585,6 +607,9 @@ def ebs_csi_role_name(self) -> str:
     def fsx_openzfs_role_name(self) -> str:
         return f"aws-fsx-openzfs-csi-driver.{self.compound_name}.posit.team"
 
+    def external_secrets_role_name(self, release: str) -> str:
+        return f"external-secrets.{release}.{self.compound_name}.posit.team"
+
     def cluster_home_role_name(self, release: str) -> str:
         return f"home.{release}.{self.compound_name}.posit.team"
 
diff --git a/python-pulumi/src/ptd/azure_workload.py b/python-pulumi/src/ptd/azure_workload.py
index dae9a08..3ef6284 100644
--- a/python-pulumi/src/ptd/azure_workload.py
+++ b/python-pulumi/src/ptd/azure_workload.py
@@ -111,6 +111,7 @@ class AzureWorkloadClusterConfig(ptd.WorkloadClusterConfig):
     system_node_pool_root_disk_size: int | None = None
 
     use_lets_encrypt: bool = False
+    enable_gateway_api: bool = False  # Enables Gateway API CRDs, Traefik Gateway provider, and gatewayRef in Site CRs
 
 
 @dataclasses.dataclass(frozen=True)
diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
index a7023af..90661dd 100644
--- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
+++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
@@ -1306,6 +1306,34 @@ def with_aws_secrets_store_csi_driver_provider(
 
         return self
 
+    def with_pod_identity_agent(
+        self,
+        version: str | None = None,
+    ) -> typing.Self:
+        """
+        Add the EKS Pod Identity Agent addon.
+
+        This addon enables EKS Pod Identity for associating IAM roles with
+        Kubernetes service accounts without IRSA annotations. Pod Identity
+        associations are created separately via aws.eks.PodIdentityAssociation.
+
+        :param version: Optional, String, version of the addon to install.
+            When None, EKS installs the addon's default version for the cluster.
+ :return: self + """ + self.pod_identity_agent_addon = aws.eks.Addon( + f"{self.name}-eks-pod-identity-agent", + args=aws.eks.AddonArgs( + addon_name="eks-pod-identity-agent", + addon_version=version, + cluster_name=self.name, + tags=self.eks.tags, + ), + opts=pulumi.ResourceOptions(parent=self.eks), + ) + + return self + def attach_efs_security_group( self, efs_file_system_id: str, diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_clusters.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_clusters.py index 9605de6..a770847 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_clusters.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_clusters.py @@ -14,6 +14,7 @@ import ptd.pulumi_resources.aws_eks_cluster import ptd.pulumi_resources.aws_iam import ptd.pulumi_resources.aws_karpenter +import ptd.pulumi_resources.aws_workload_helm import ptd.pulumi_resources.custom_k8s_resources import ptd.pulumi_resources.external_dns import ptd.pulumi_resources.helm_controller @@ -27,6 +28,26 @@ import ptd.secrecy +def _pod_identity_assoc( + resource: "AWSWorkloadClusters", + logical_name: str, + cluster_name: str, + namespace: str, + service_account: str, + role_arn: pulumi.Input[str], +) -> None: + """Create a single EKS PodIdentityAssociation with standard naming and tagging.""" + aws.eks.PodIdentityAssociation( + f"{cluster_name}-{logical_name}-pod-identity", + cluster_name=cluster_name, + namespace=namespace, + service_account=service_account, + role_arn=role_arn, + tags=resource.required_tags, + opts=pulumi.ResourceOptions(parent=resource), + ) + + class AWSWorkloadClusters(pulumi.ComponentResource): workload: ptd.aws_workload.AWSWorkload @@ -48,6 +69,7 @@ class AWSWorkloadClusters(pulumi.ComponentResource): autoscaling_queues: dict[str, aws.sqs.Queue] packagemanager_roles: dict[str, aws.iam.Role | pulumi.Output[aws.iam.Role]] team_operator_roles: dict[str, aws.iam.Role] + external_secrets_roles: dict[str, aws.iam.Role] workbench_roles: dict[str, aws.iam.Role] workbench_session_roles: dict[str, aws.iam.Role] @@ -102,12 +124,25 @@ def __init__(self, workload: ptd.aws_workload.AWSWorkload, *args, **kwargs): f"organization/ptd-aws-workload-persistent/{self.workload.compound_name}" ) + # Initialize optional product role dicts defensively so _define_pod_identity_associations + # can safely check membership even if the defining methods are skipped or reordered. + self.chronicle_roles = {} + self.home_roles = {} + self.external_secrets_roles = {} + self.connect_roles = {} + self.connect_session_roles = {} + self.workbench_roles = {} + self.workbench_session_roles = {} + self._define_home_iam() self._define_chronicle_iam(persistent_stack) self._define_connect_iam() self._define_workbench_iam() self._define_packagemanager_iam(persistent_stack) self._define_team_operator_iam() + self._define_external_secrets_iam() + # Create Pod Identity associations for all products (ADDITIVE - keeps IRSA for backward compatibility) + self._define_pod_identity_associations() self._apply_custom_k8s_resources() self._define_team_operator() # after team operator so we can reuse the namespaces @@ -134,6 +169,8 @@ def _oidc_url_tails(self): @staticmethod def _define_read_secrets_inline() -> str: + # resources=["*"] is intentional: workload roles (connect, workbench, packagemanager, etc.) + # all use this same broad policy. Scoping to specific ARN prefixes is deferred work. 
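+        # (_define_eso_read_secrets_inline below is the prefix-scoped variant; the
+        # same shape would apply here once that scoping work is picked up.)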
        return aws.iam.get_policy_document(
             statements=[
                 aws.iam.GetPolicyDocumentStatementArgs(
@@ -148,6 +185,29 @@
             ]
         ).json
 
+    def _define_eso_read_secrets_inline(self) -> str:
+        # ESO uses a cluster-wide ClusterSecretStore, so its blast radius is larger than
+        # per-product IRSA roles. Scope to this workload's secret prefix to prevent
+        # cross-workload reads when multiple workloads share the same AWS account.
+        account_id = aws.get_caller_identity().account_id
+        region = self.workload.cfg.region
+        prefix = self.workload.compound_name
+        return aws.iam.get_policy_document(
+            statements=[
+                aws.iam.GetPolicyDocumentStatementArgs(
+                    effect="Allow",
+                    actions=[
+                        "secretsmanager:GetSecretValue",
+                        "secretsmanager:DescribeSecret",
+                        # ListSecrets does not support resource-level permissions in IAM:
+                        # scoped to these ARNs it would never match (and so never be granted),
+                        # and granting it properly would need a separate resources=["*"]
+                        # statement — i.e. list access to every secret in the account.
+                    ],
+                    resources=[f"arn:aws:secretsmanager:{region}:{account_id}:secret:{prefix}/*"],
+                )
+            ]
+        ).json
+
     @staticmethod
     def _define_streaming_bedrock_access() -> str:
         return aws.iam.get_policy_document(
@@ -230,6 +290,7 @@ def _define_home_iam(self):
                 role_policies=[
                     self._define_read_secrets_inline(),
                 ],
+                pod_identity=self.workload.cfg.clusters[release].enable_pod_identity_agent,
             )
 
     def _define_connect_iam(self):
@@ -237,12 +298,14 @@
 
         for release in self.managed_clusters_by_release:
+            pod_identity = self.workload.cfg.clusters[release].enable_pod_identity_agent
             self.connect_roles[release] = self._define_k8s_iam_role(
                 name=self.workload.cluster_connect_role_name(release),
                 release=release,
                 namespace=ptd.POSIT_TEAM_NAMESPACE,
                 service_accounts=[f"{site_name}-connect" for site_name in sorted(self.workload.cfg.sites.keys())],
                 role_policies=[self._define_read_secrets_inline()],
+                pod_identity=pod_identity,
             )
 
         for site_name in sorted(self.workload.cfg.sites.keys()):
@@ -256,6 +319,7 @@ def _define_connect_iam(self):
                 policy=policy,
                 policy_name=role_name,
                 role_policies=[self._define_streaming_bedrock_access()],
+                pod_identity=pod_identity,
             )
 
     def _define_workbench_iam(self):
@@ -286,6 +350,7 @@
                 namespace=ptd.POSIT_TEAM_NAMESPACE,
                 service_accounts=[f"{site_name}-workbench" for site_name in sorted(self.workload.cfg.sites.keys())],
                 role_policies=workbench_role_policies,
+                pod_identity=cluster_cfg.enable_pod_identity_agent,
             )
 
         for site_name in sorted(self.workload.cfg.sites.keys()):
@@ -311,6 +376,7 @@
                 policy_name=role_name,
                 service_accounts=[f"{site_name}-workbench-session"],
                 role_policies=workbench_session_role_policies,
+                pod_identity=cluster_cfg.enable_pod_identity_agent,
             )
 
     def _define_packagemanager_iam(self, persistent_stack):
@@ -337,6 +403,7 @@
                 required_tags=self.required_tags,
             )
 
+            # Key format: release + "//" + site_name — must stay in sync with _define_pod_identity_associations.
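+            # (test_packagemanager_roles_key_format in tests/test_workload_cluster_config.py
+            # guards this key-format invariant.)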
self.packagemanager_roles[release + "//" + site_name] = self._define_k8s_iam_role( name=self.workload.cluster_packagemanager_role_name(release, site_name), release=release, @@ -345,6 +412,7 @@ def _define_packagemanager_iam(self, persistent_stack): policy=policy, policy_name=policy_name, role_policies=[self._define_read_secrets_inline()], + pod_identity=self.workload.cfg.clusters[release].enable_pod_identity_agent, ) def _define_k8s_iam_role( @@ -359,6 +427,7 @@ def _define_k8s_iam_role( role_policies: pulumi.Input[typing.Sequence[pulumi.Input[str]],] | None = None, auth_issuers: list[ptd.aws_iam.AuthIssuer] | None = None, opts: pulumi.ResourceOptions | None = None, + pod_identity: bool = False, # noqa: FBT001, FBT002 ) -> aws.iam.Role: """ Define a Kubernetes IAM role with appropriate trust relationships. @@ -371,6 +440,7 @@ def _define_k8s_iam_role( :param role_policies: Role policies to attach to the role (Previously known as inline_policies) :param auth_issuers: A list of auth issuers that the role should trust. DO NOT list the same auth issuer more than once! Use a list of client_ids instead + :param pod_identity: When True, adds pods.eks.amazonaws.com as a trusted principal (required for Pod Identity) :return: aws.iam.Role """ if auth_issuers is None: @@ -379,34 +449,51 @@ def _define_k8s_iam_role( service_accounts = [] if opts is None: opts = pulumi.ResourceOptions() + + extra_statements = ( + [ + { + "Action": ["sts:AssumeRole", "sts:TagSession"], + "Effect": "Allow", + "Principal": {"Service": "pods.eks.amazonaws.com"}, + } + ] + if pod_identity + else [] + ) + + if len(self._oidc_url_tails) > 0 or len(auth_issuers) > 0: + irsa_policy = ptd.aws_iam.build_hybrid_irsa_role_assume_role_policy( + service_accounts=service_accounts, + namespace=namespace, + managed_account_id=self.workload.cfg.account_id, + oidc_url_tails=self._oidc_url_tails, + auth_issuers=auth_issuers, + ) + if not isinstance(irsa_policy.get("Statement"), list): + msg = "Expected Statement list from build_hybrid_irsa_role_assume_role_policy" + raise ValueError(msg) + base_policy = {**irsa_policy, "Statement": list(irsa_policy["Statement"]) + extra_statements} + else: + base_policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Action": "sts:AssumeRole", + "Effect": "Allow", + "Principal": { + "AWS": aws.get_caller_identity().arn, + }, + }, + *extra_statements, + ], + } + role = aws.iam.Role( name, aws.iam.RoleArgs( name=name, - assume_role_policy=json.dumps( - ( - ptd.aws_iam.build_hybrid_irsa_role_assume_role_policy( - service_accounts=service_accounts, - namespace=namespace, - managed_account_id=self.workload.cfg.account_id, - oidc_url_tails=self._oidc_url_tails, - auth_issuers=auth_issuers, - ) - if len(self._oidc_url_tails) > 0 or len(auth_issuers) > 0 - else { - "Version": "2012-10-17", - "Statement": [ - { - "Action": "sts:AssumeRole", - "Effect": "Allow", - "Principal": { - "AWS": aws.get_caller_identity().arn, - }, - }, - ], - } - ), - ), + assume_role_policy=json.dumps(base_policy), permissions_boundary=self.workload.iam_permissions_boundary, tags=self.required_tags, ), @@ -476,6 +563,7 @@ def _define_chronicle_iam(self, persistent_stack): policy=policy, policy_name=policy_name, role_policies=[self._define_read_secrets_inline()], + pod_identity=self.workload.cfg.clusters[release].enable_pod_identity_agent, ) read_only_policy_name = self.workload.chronicle_read_only_s3_bucket_policy_name(release, site_name) @@ -559,6 +647,176 @@ def _define_team_operator_iam(self): 
policy_name=self.workload.team_operator_policy_name, ) + def _define_external_secrets_iam(self) -> None: + """Define IAM roles for external-secrets-operator to access AWS Secrets Manager.""" + for release in self.managed_clusters_by_release: + cluster_cfg = self.workload.cfg.clusters[release] + if not cluster_cfg.enable_external_secrets_operator: + continue + # Invariant: enable_pod_identity_agent=True when enable_external_secrets_operator=True + # is enforced by AWSWorkloadClusterConfig.__post_init__; no need to re-check here. + # pod_identity=True: ESO uses no auth block in ClusterSecretStore and relies exclusively + # on Pod Identity, so the role trust policy must trust pods.eks.amazonaws.com. + self.external_secrets_roles[release] = self._define_k8s_iam_role( + name=self.workload.external_secrets_role_name(release), + release=release, + namespace=ptd.pulumi_resources.aws_workload_helm.ESO_NAMESPACE, + service_accounts=[ptd.pulumi_resources.aws_workload_helm.ESO_SERVICE_ACCOUNT], + role_policies=[self._define_eso_read_secrets_inline()], + pod_identity=True, + ) + + def _define_pod_identity_associations(self) -> None: + """ + Create EKS Pod Identity associations for all product service accounts. + + This is ADDITIVE - existing IRSA roles and annotations are kept for backward compatibility. + Both Pod Identity and IRSA can coexist. The operator will be updated to stop computing + IRSA annotations in a future phase. + + Pod Identity associations connect service accounts directly to IAM roles without requiring + annotations on the ServiceAccount resource. + + Note: team_operator_roles is intentionally excluded here. The team-operator's service + account retains IRSA-based access; Pod Identity will be added in a future phase once + the operator itself is updated to remove IRSA annotation computation. + + Note: fsx_openzfs_roles is also intentionally excluded. The FSx OpenZFS CSI driver uses + node-level IAM (instance profile) rather than pod-level credentials, so no Pod Identity + association is needed for those roles. + """ + for release in self.managed_clusters_by_release: + cluster_cfg = self.workload.cfg.clusters[release] + if not cluster_cfg.enable_pod_identity_agent: + continue + + cluster_name = f"{self.workload.compound_name}-{release}" + + # External Secrets Operator (per-release, only if ESO is also enabled) + if cluster_cfg.enable_external_secrets_operator: + if release not in self.external_secrets_roles: + msg = ( + f"external_secrets_roles missing key {release!r}; " + "_define_external_secrets_iam must be called before _define_pod_identity_associations" + ) + raise RuntimeError(msg) + _eso_sa = ptd.pulumi_resources.aws_workload_helm.ESO_SERVICE_ACCOUNT + _eso_ns = ptd.pulumi_resources.aws_workload_helm.ESO_NAMESPACE + _pod_identity_assoc( + self, + _eso_sa, + cluster_name, + _eso_ns, + _eso_sa, + self.external_secrets_roles[release].arn, + ) + + # Per-site product associations + for site_name in sorted(self.workload.cfg.sites.keys()): + # Connect + if release not in self.connect_roles: + msg = ( + f"connect_roles missing key {release!r}; " + "_define_connect_iam must be called before _define_pod_identity_associations" + ) + raise RuntimeError(msg) + _pod_identity_assoc( + self, + f"{site_name}-connect", + cluster_name, + ptd.POSIT_TEAM_NAMESPACE, + f"{site_name}-connect", + self.connect_roles[release].arn, + ) + + # Connect Session — always present: _define_connect_iam populates connect_session_roles + # for every release/site combo unconditionally. 
+ _session_key = f"{release}-{site_name}" + if _session_key not in self.connect_session_roles: + msg = ( + f"connect_session_roles missing key {_session_key!r}; " + "_define_connect_iam must be called before _define_pod_identity_associations" + ) + raise RuntimeError(msg) + _pod_identity_assoc( + self, + f"{site_name}-connect-session", + cluster_name, + ptd.POSIT_TEAM_NAMESPACE, + f"{site_name}-connect-session", + self.connect_session_roles[_session_key].arn, + ) + + # Workbench + if release not in self.workbench_roles: + msg = ( + f"workbench_roles missing key {release!r}; " + "_define_workbench_iam must be called before _define_pod_identity_associations" + ) + raise RuntimeError(msg) + _pod_identity_assoc( + self, + f"{site_name}-workbench", + cluster_name, + ptd.POSIT_TEAM_NAMESPACE, + f"{site_name}-workbench", + self.workbench_roles[release].arn, + ) + + # Workbench Session — always present: _define_workbench_iam populates workbench_session_roles + # for every release/site combo unconditionally. + if _session_key not in self.workbench_session_roles: + msg = ( + f"workbench_session_roles missing key {_session_key!r}; " + "_define_workbench_iam must be called before _define_pod_identity_associations" + ) + raise RuntimeError(msg) + _pod_identity_assoc( + self, + f"{site_name}-workbench-session", + cluster_name, + ptd.POSIT_TEAM_NAMESPACE, + f"{site_name}-workbench-session", + self.workbench_session_roles[_session_key].arn, + ) + + # Package Manager + # Key format uses "//" separator — must match _define_packagemanager_iam (release + "//" + site_name). + if release + "//" + site_name in self.packagemanager_roles: + _pod_identity_assoc( + self, + f"{site_name}-packagemanager", + cluster_name, + ptd.POSIT_TEAM_NAMESPACE, + f"{site_name}-packagemanager", + self.packagemanager_roles[release + "//" + site_name].arn, + ) + + # Chronicle (optional product — skip if not configured for this release/site) + if f"{release}-{site_name}" in self.chronicle_roles: + _pod_identity_assoc( + self, + f"{site_name}-chronicle", + cluster_name, + ptd.POSIT_TEAM_NAMESPACE, + f"{site_name}-chronicle", + self.chronicle_roles[f"{release}-{site_name}"].arn, + ) + + # Home/Flightdeck (optional product — skip if not configured for this release) + # home_roles is keyed per-release (one IAM role per release), but Home's trust + # policy allows all per-site SAs ({site_name}-home) — see _define_home_iam. + # Pod Identity requires one association per SA, so this block stays inside the loop. 
+ if release in self.home_roles: + _pod_identity_assoc( + self, + f"{site_name}-home", + cluster_name, + ptd.POSIT_TEAM_NAMESPACE, + f"{site_name}-home", + self.home_roles[release].arn, + ) + def _apply_custom_k8s_resources(self): """Apply custom Kubernetes resources from the custom_k8s_resources/ directory.""" ptd.pulumi_resources.custom_k8s_resources.apply_custom_k8s_resources( diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_eks.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_eks.py index 8796f73..5f43fb8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_eks.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_eks.py @@ -157,6 +157,10 @@ def _build_with_vpc_config( if self.workload.cfg.secrets_store_addon_enabled: eks_cluster.with_aws_secrets_store_csi_driver_provider() + # Enable EKS Pod Identity Agent for cloud-agnostic IAM (opt-in) + if cluster_cfg.enable_pod_identity_agent: + eks_cluster.with_pod_identity_agent(version=cluster_cfg.pod_identity_agent_version) + eks_cluster.with_gp3() eks_cluster.with_encrypted_ebs_storage_class() eks_cluster.with_oidc_provider() diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 029bfeb..4b6f735 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -12,6 +12,62 @@ from ptd.pulumi_resources.lib import format_lb_tags ALLOY_NAMESPACE = "alloy" +NFS_STORAGE_CLASS_NAME = "posit-shared-storage" +CLUSTER_SECRET_STORE_NAME = "aws-secrets-manager" # noqa: S105 +ESO_SERVICE_ACCOUNT = "external-secrets" +ESO_NAMESPACE = "external-secrets" +# v1beta1 matches external_secrets_operator_version default "0.10.7". +# Update this if ESO is upgraded past the version that drops v1beta1 support. 
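+# A quick way to check which API versions a running cluster serves (illustrative):
+#   kubectl get crd clustersecretstores.external-secrets.io -o jsonpath='{.spec.versions[*].name}'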
+ESO_API_VERSION = "external-secrets.io/v1beta1" + + +def _nfs_subdir_provisioner_values(fsx_dns_name: str, fsx_nfs_path: str = "/fsx") -> dict: + """Build the Helm values dict for nfs-subdir-external-provisioner.""" + return { + "nfs": { + "server": fsx_dns_name, + "path": fsx_nfs_path, + "mountOptions": [ + "nfsvers=4.2", + "rsize=1048576", + "wsize=1048576", + "timeo=600", + ], + }, + "storageClass": { + "name": NFS_STORAGE_CLASS_NAME, + "reclaimPolicy": "Retain", + "accessModes": "ReadWriteMany", + "onDelete": "retain", + "pathPattern": "${.PVC.annotations.nfs.io/storage-path}", + }, + } + + +def _eso_helm_values() -> dict: + """Build the Helm values dict for external-secrets-operator.""" + return { + "installCRDs": True, + "serviceAccount": { + "create": True, + "name": ESO_SERVICE_ACCOUNT, + }, + } + + +def _cluster_secret_store_spec(region: str) -> dict: + """Build the ClusterSecretStore spec for AWS Secrets Manager (no auth — uses Pod Identity).""" + return { + "provider": { + "aws": { + "service": "SecretsManager", + "region": region, + }, + }, + "conditions": [ + {"namespaceSelector": {"matchLabels": {"kubernetes.io/metadata.name": ptd.POSIT_TEAM_NAMESPACE}}} + ], + } def _build_alb_tag_string(true_name: str, environment: str, compound_name: str) -> str: @@ -74,10 +130,22 @@ def __init__(self, workload: ptd.aws_workload.AWSWorkload, *args, **kwargs): self._define_aws_lbc(release, components.aws_load_balancer_controller_version) self._define_aws_fsx_openzfs_csi(release, components.aws_fsx_openzfs_csi_driver_version) + # Deploy nfs-subdir-external-provisioner (opt-in via enable_nfs_subdir_provisioner) + if self.workload.cfg.clusters[release].enable_nfs_subdir_provisioner: + self._define_nfs_subdir_provisioner(release, components.nfs_subdir_provisioner_version) if not self.workload.cfg.secrets_store_addon_enabled: self._define_secret_store_csi(release, components.secret_store_csi_driver_version) self._define_secret_store_csi_aws(release, components.secret_store_csi_driver_aws_provider_version) + # Deploy external-secrets-operator (opt-in via enable_external_secrets_operator) + if self.workload.cfg.clusters[release].enable_external_secrets_operator: + self._define_external_secrets_operator(release, components.external_secrets_operator_version) + # Deploy Gateway API CRDs (opt-in via enable_gateway_api) + if self.workload.cfg.clusters[release].enable_gateway_api: + self._define_gateway_api_crds(release) self._define_traefik(release, components.traefik_version, weight, cert_arns_output) + # Create Gateway resources (Gateway, GatewayClass, ReferenceGrant) after Traefik + if self.workload.cfg.clusters[release].enable_gateway_api: + self._define_gateway_resources(release) self._define_metrics_server(release, components.metrics_server_version) self._define_loki(release, components.loki_version, components) self._define_grafana(release, components.grafana_version) @@ -170,6 +238,119 @@ def _define_aws_fsx_openzfs_csi(self, release: str, version: str): opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), ) + def _define_nfs_subdir_provisioner(self, release: str, version: str | None) -> None: + """Deploy nfs-subdir-external-provisioner for FSx storage.""" + workload_secrets, ok = ptd.secrecy.aws_get_secret_value_json( + self.workload.secret_name, region=self.workload.cfg.region + ) + if not ok or "fs-dns-name" not in workload_secrets: + msg = ( + f"enable_nfs_subdir_provisioner=True but secret '{self.workload.secret_name}' " + "is missing or does not contain 'fs-dns-name'." 
+            )
+            if pulumi.runtime.is_dry_run():
+                pulumi.log.warn(
+                    "[ACTION REQUIRED] " + msg + " NFS subdir provisioner will be ABSENT from this preview diff; "
+                    "`pulumi up` will raise an error unless the secret is populated first."
+                )
+                return
+            raise ValueError(msg)
+
+        fsx_dns_name = workload_secrets["fs-dns-name"]
+        fsx_nfs_path = workload_secrets.get("fs-nfs-path", "/fsx")
+
+        spec: dict = {
+            "repo": "https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/",
+            "chart": "nfs-subdir-external-provisioner",
+            "targetNamespace": ptd.KUBE_SYSTEM_NAMESPACE,
+            "valuesContent": yaml.dump(_nfs_subdir_provisioner_values(fsx_dns_name, fsx_nfs_path)),
+        }
+        if version is not None:
+            spec["version"] = version
+
+        k8s.apiextensions.CustomResource(
+            f"{self.workload.compound_name}-{release}-nfs-subdir-provisioner-helm-release",
+            metadata=k8s.meta.v1.ObjectMetaArgs(
+                name="nfs-subdir-external-provisioner",
+                namespace=ptd.HELM_CONTROLLER_NAMESPACE,
+                labels=self.required_tags,
+            ),
+            api_version="helm.cattle.io/v1",
+            kind="HelmChart",
+            spec=spec,
+            opts=pulumi.ResourceOptions(provider=self.kube_providers[release]),
+        )
+
+    def _define_external_secrets_operator(self, release: str, version: str | None) -> None:
+        """Deploy external-secrets-operator and create ClusterSecretStore for AWS Secrets Manager.
+
+        Note: the ClusterSecretStore is created with ``depends_on=[eso_helm_release]``, which
+        ensures Pulumi registers it after the HelmChart CR object exists in the API server.
+        However, this does NOT wait for the Helm release to complete and CRDs to be installed.
+        On a fresh deploy, the ClusterSecretStore apply will fail until ESO's CRDs converge
+        (~1-2 reconcile loops). This is an architectural constraint of using HelmChart CRs
+        rather than ``pulumi_kubernetes.helm.v3.Release``.
+        """
+        # Deploy external-secrets-operator Helm chart
+        # Note: helm-controller (RKE2) auto-creates the targetNamespace from the HelmChart CR,
+        # so the "external-secrets" namespace does not need to be created explicitly here.
+        eso_spec: dict = {
+            "repo": "https://charts.external-secrets.io",
+            "chart": "external-secrets",
+            "targetNamespace": ESO_NAMESPACE,
+            "valuesContent": yaml.dump(_eso_helm_values()),
+        }
+        if version is not None:
+            eso_spec["version"] = version
+
+        eso_helm_release = k8s.apiextensions.CustomResource(
+            f"{self.workload.compound_name}-{release}-external-secrets-helm-release",
+            metadata=k8s.meta.v1.ObjectMetaArgs(
+                name="external-secrets",
+                namespace=ptd.HELM_CONTROLLER_NAMESPACE,
+                labels=self.required_tags,
+            ),
+            api_version="helm.cattle.io/v1",
+            kind="HelmChart",
+            spec=eso_spec,
+            opts=pulumi.ResourceOptions(provider=self.kube_providers[release]),
+        )
+
+        # Create ClusterSecretStore for AWS Secrets Manager.
+        # depends_on the HelmChart CR so Pulumi applies it after the ESO chart CR is registered.
+        # CustomTimeouts makes the eventual-consistency explicit: on a fresh cluster the CRD may not
+        # be available immediately; Pulumi will retry for up to 10 minutes before failing.
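+        # If that window is still exceeded, a second `ptd ensure` run converges once the
+        # CRDs exist (see docs/KNOWN_ISSUES.md: "ClusterSecretStore Fails on First Run").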
+ k8s.apiextensions.CustomResource( + f"{self.workload.compound_name}-{release}-cluster-secret-store", + metadata=k8s.meta.v1.ObjectMetaArgs( + name=CLUSTER_SECRET_STORE_NAME, + labels=self.required_tags, + ), + api_version=ESO_API_VERSION, + kind="ClusterSecretStore", + spec=_cluster_secret_store_spec(self.workload.cfg.region), + opts=pulumi.ResourceOptions( + provider=self.kube_providers[release], + depends_on=[eso_helm_release], + custom_timeouts=pulumi.CustomTimeouts(create="10m"), + ), + ) + + def _define_gateway_api_crds(self, release: str) -> None: + """Install Gateway API standard CRDs before Traefik deployment. + + Installs Gateway API v1.2.1 standard CRDs (Gateway, GatewayClass, HTTPRoute, ReferenceGrant). + This must be installed before Traefik's Gateway API provider is enabled. + + Note: Using a static URL install (not a Helm chart) to match the design doc approach. + The CRDs are cluster-scoped and will be installed once per cluster. + """ + k8s.yaml.ConfigFile( + f"{self.workload.compound_name}-{release}-gateway-api-crds", + file="https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.1/standard-install.yaml", + opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), + ) + def _define_secret_store_csi(self, release: str, version: str): k8s.apiextensions.CustomResource( f"{self.workload.compound_name}-{release}-secret-store-csi-helm-release", @@ -267,6 +448,97 @@ def _define_aws_lbc(self, release: str, version: str): opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), ) + def _define_gateway_resources(self, release: str) -> None: + """Create Gateway API resources (GatewayClass, Gateway, ReferenceGrant). + + Creates the infrastructure Gateway resources that the team-operator will reference + via gatewayRef in Site CRs. These must be created after Traefik is deployed and + after Gateway API CRDs are installed. + """ + # Create GatewayClass + k8s.apiextensions.CustomResource( + f"{self.workload.compound_name}-{release}-traefik-gateway-class", + api_version="gateway.networking.k8s.io/v1", + kind="GatewayClass", + metadata=k8s.meta.v1.ObjectMetaArgs( + name="traefik", + labels=self.required_tags, + ), + spec={ + "controllerName": "traefik.io/gateway-controller", + }, + opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), + ) + + # Create Gateway in traefik namespace + # Note: TLS configuration is not included here as it's managed by the ALB/NLB + # in AWS. The Gateway defines the entry points for HTTPRoutes. 
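+        # HTTPRoutes created by the team-operator attach to these listeners via the
+        # gatewayRef ({"name": "posit-team", "namespace": "traefik"}) set on Site CRs.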
+ k8s.apiextensions.CustomResource( + f"{self.workload.compound_name}-{release}-posit-team-gateway", + api_version="gateway.networking.k8s.io/v1", + kind="Gateway", + metadata=k8s.meta.v1.ObjectMetaArgs( + name="posit-team", + namespace=ptd.TRAEFIK_NAMESPACE, + labels=self.required_tags, + ), + spec={ + "gatewayClassName": "traefik", + "listeners": [ + { + "name": "https", + "protocol": "HTTPS", + "port": 443, + "allowedRoutes": { + "namespaces": { + "from": "All", + }, + }, + }, + { + "name": "http", + "protocol": "HTTP", + "port": 80, + "allowedRoutes": { + "namespaces": { + "from": "All", + }, + }, + }, + ], + }, + opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), + ) + + # Create ReferenceGrant to allow HTTPRoutes in posit-team namespace + # to reference Services in traefik namespace (cross-namespace references) + k8s.apiextensions.CustomResource( + f"{self.workload.compound_name}-{release}-allow-posit-team-routes", + api_version="gateway.networking.k8s.io/v1beta1", + kind="ReferenceGrant", + metadata=k8s.meta.v1.ObjectMetaArgs( + name="allow-posit-team", + namespace=ptd.TRAEFIK_NAMESPACE, + labels=self.required_tags, + ), + spec={ + "from": [ + { + "group": "gateway.networking.k8s.io", + "kind": "HTTPRoute", + "namespace": ptd.POSIT_TEAM_NAMESPACE, + } + ], + "to": [ + { + "group": "", + "kind": "Service", + } + ], + }, + opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), + ) + def _define_metrics_server(self, release: str, version: str): k8s.apiextensions.CustomResource( f"{self.workload.compound_name}-{release}-metrics-server-helm-release", @@ -657,6 +929,23 @@ def _define_traefik(self, release: str, version: str, weight: str, cert_arns_out opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), ) + # Build providers config conditionally based on enable_gateway_api flag + providers_config = { + "kubernetesCRD": { + "allowCrossNamespace": True, + "enabled": True, + }, + "kubernetesIngress": { + "enabled": True, + }, + } + + # Add Gateway API provider if enabled + if self.workload.cfg.clusters[release].enable_gateway_api: + providers_config["kubernetesGateway"] = { + "enabled": True, + } + chart = k8s.apiextensions.CustomResource( f"{self.workload.compound_name}-{release}-traefik-helm-release", metadata=k8s.meta.v1.ObjectMetaArgs( @@ -693,15 +982,7 @@ def _define_traefik(self, release: str, version: str, weight: str, cert_arns_out "enabled": True, }, }, - "providers": { - "kubernetesCRD": { - "allowCrossNamespace": True, - "enabled": True, - }, - "kubernetesIngress": { - "enabled": True, - }, - }, + "providers": providers_config, "ports": { "traefik": { "expose": { diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_sites.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_sites.py index 836bc1f..603f50b 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_sites.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_sites.py @@ -11,6 +11,29 @@ import ptd.pulumi_resources.aws_eks_cluster import ptd.pulumi_resources.team_site import ptd.secrecy +from ptd.pulumi_resources.aws_workload_helm import CLUSTER_SECRET_STORE_NAME, ESO_API_VERSION + + +def _external_secret_spec(site_name: str, secret_key: str) -> dict: + """Build the ExternalSecret spec dict for a site.""" + return { + "refreshInterval": "1h", + "secretStoreRef": { + "name": CLUSTER_SECRET_STORE_NAME, + "kind": "ClusterSecretStore", + }, + "target": { + "name": f"{site_name}-secrets", + "creationPolicy": "Owner", + }, + "dataFrom": [ + { + 
"extract": { + "key": secret_key, + } + } + ], + } class AWSWorkloadSites(pulumi.ComponentResource): @@ -70,65 +93,105 @@ def __init__(self, workload: ptd.aws_workload.AWSWorkload, *args, **kwargs): raise ValueError(msg) self._define_team_sites() + self._define_external_secrets() def _define_team_sites(self): self.team_sites = {} - def set_workload_fields(obj: dict[str, typing.Any], _: pulumi.ResourceOptions): - if obj["kind"] != "Site": - return + for release in self.managed_clusters_by_release: + cluster_cfg = self.workload.cfg.clusters.get(release) - workload_secrets = typing.cast( - ptd.secrecy.AWSWorkloadSecret, - self.workload_secrets_dict, - ) - main_db = ptd.aws_rds_describe_db_instance( - workload_secrets.get("main-database-id", ""), region=self.workload.cfg.region - ) + def generate_set_workload_fields( + _release: str, cluster_cfg: typing.Any + ) -> ptd.pulumi_resources.KustomizeTransformationFunc: + def set_workload_fields(obj: dict[str, typing.Any], _: pulumi.ResourceOptions): + if obj["kind"] != "Site": + return - account_id = aws.get_caller_identity().account_id + workload_secrets = typing.cast( + ptd.secrecy.AWSWorkloadSecret, + self.workload_secrets_dict, + ) + main_db = ptd.aws_rds_describe_db_instance( + workload_secrets.get("main-database-id", ""), region=self.workload.cfg.region + ) - # Check if EFS is enabled for any cluster in this release - cluster_cfg = self.workload.cfg.clusters.get(release) - efs_enabled = False - if cluster_cfg: - efs_enabled = cluster_cfg.enable_efs_csi_driver or cluster_cfg.efs_config is not None - - site_spec = { - "awsAccountId": account_id, - "chronicle": { - "s3Bucket": workload_secrets["chronicle-bucket"], - }, - "domain": self.workload.cfg.domain, - "mainDatabaseCredentialSecret": { - "type": "aws", - "vaultName": main_db["MasterUserSecret"]["SecretArn"], - }, - "networkTrust": self.workload.cfg.network_trust.value, - "packageManager": { - "s3Bucket": workload_secrets["packagemanager-bucket"], - }, - "secret": {"type": "aws"}, - "secretType": "aws", - "volumeSource": { - "dnsName": workload_secrets["fs-dns-name"], - "type": "nfs", - }, - "workloadSecret": {"type": "aws"}, - } + account_id = aws.get_caller_identity().account_id - # Add EFS configuration if enabled - if efs_enabled: - site_spec["efsEnabled"] = True - if self.workload.cfg.vpc_cidr: - site_spec["vpcCIDR"] = self.workload.cfg.vpc_cidr + # Check if EFS is enabled for any cluster in this release + efs_enabled = False + if cluster_cfg: + efs_enabled = cluster_cfg.enable_efs_csi_driver or cluster_cfg.efs_config is not None - obj["spec"] = deepmerge.always_merger.merge( - obj.get("spec", {}), - copy.deepcopy(site_spec), - ) + site_spec = { + "awsAccountId": account_id, + "chronicle": { + "s3Bucket": workload_secrets["chronicle-bucket"], + }, + "domain": self.workload.cfg.domain, + "mainDatabaseCredentialSecret": { + "type": "aws", + "vaultName": main_db["MasterUserSecret"]["SecretArn"], + }, + "networkTrust": self.workload.cfg.network_trust.value, + "packageManager": { + "s3Bucket": workload_secrets["packagemanager-bucket"], + }, + "secret": {"type": "aws"}, + "secretType": "aws", + "volumeSource": { + "dnsName": workload_secrets["fs-dns-name"], + "type": "nfs", + }, + "workloadSecret": {"type": "aws"}, + } + + # Add EFS configuration if enabled + if efs_enabled: + site_spec["efsEnabled"] = True + if self.workload.cfg.vpc_cidr: + site_spec["vpcCIDR"] = self.workload.cfg.vpc_cidr + + # Cloud-agnostic storage (when nfs-subdir-provisioner is enabled) + if cluster_cfg and 
cluster_cfg.enable_nfs_subdir_provisioner: + site_spec["storageClassName"] = "posit-shared-storage" + # Use nfsEgressCIDR instead of efsEnabled/vpcCIDR + if self.workload.cfg.vpc_cidr: + site_spec["nfsEgressCIDR"] = self.workload.cfg.vpc_cidr + + # Cloud-agnostic secrets (when external-secrets-operator is enabled) + if cluster_cfg and cluster_cfg.enable_external_secrets_operator: + # Use K8s Secret names instead of type+vaultName + # Note: site_name comes from obj metadata, workload secret is workload-scoped + site_name = obj.get("metadata", {}).get("name", "") + site_spec["secret"] = {"name": f"{site_name}-secrets"} + site_spec["workloadSecret"] = {"name": f"{self.workload.compound_name}-secrets"} + + # Cloud-agnostic IAM (when Pod Identity is enabled) + if cluster_cfg and cluster_cfg.enable_pod_identity_agent: + # Set explicit ServiceAccount names for Pod Identity contract + site_name = obj.get("metadata", {}).get("name", "") + site_spec.setdefault("connect", {})["serviceAccountName"] = f"{site_name}-connect" + site_spec.setdefault("workbench", {})["serviceAccountName"] = f"{site_name}-workbench" + site_spec.setdefault("packageManager", {})["serviceAccountName"] = f"{site_name}-packagemanager" + site_spec.setdefault("chronicle", {})["serviceAccountName"] = f"{site_name}-chronicle" + site_spec.setdefault("flightdeck", {})["serviceAccountName"] = f"{site_name}-home" + + # Cloud-agnostic ingress (when Gateway API is enabled) + if cluster_cfg and cluster_cfg.enable_gateway_api: + # Reference the Gateway resource created by infrastructure + site_spec["gatewayRef"] = { + "name": "posit-team", + "namespace": "traefik", + } + + obj["spec"] = deepmerge.always_merger.merge( + obj.get("spec", {}), + copy.deepcopy(site_spec), + ) + + return set_workload_fields - for release in self.managed_clusters_by_release: for site_name in sorted(self.workload.cfg.sites.keys()): def generate_set_site_fields( @@ -156,7 +219,7 @@ def set_site_fields(obj: dict[str, typing.Any], _: pulumi.ResourceOptions): site_name=site_name, kubeconfig=self.kubeconfigs[release], transformations=[ - set_workload_fields, + generate_set_workload_fields(release, cluster_cfg), generate_set_site_fields(site_name), ], cluster_config=self.workload.cfg.clusters[release], @@ -165,3 +228,55 @@ def set_site_fields(obj: dict[str, typing.Any], _: pulumi.ResourceOptions): providers=[self.kube_providers[release]], ), ) + + def _define_external_secrets(self) -> None: + """ + Create ExternalSecret CRs for each site to sync secrets from AWS Secrets Manager to K8s Secrets. + + This creates K8s Secrets that the operator can reference by name instead of calling AWS SDK directly. + + Note: these CRs reference the `aws-secrets-manager` ClusterSecretStore which is created by + AWSWorkloadHelm. No Pulumi ``depends_on`` is wired here because even if we declared one, it + would only guarantee the HelmChart CR object exists — not that ESO's CRDs have converged. + The ClusterSecretStore will retry until ESO is ready (~1-2 reconcile loops). 
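+        Once ESO is running, sync status is reported on each ExternalSecret's status
+        conditions (a Ready condition appears once the target Secret has been created).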
+ """ + for release in self.managed_clusters_by_release: + if not self.workload.cfg.clusters[release].enable_external_secrets_operator: + continue + + # Create ExternalSecret for workload-level secrets (once per release) + kubernetes.apiextensions.CustomResource( + f"{self.workload.compound_name}-{release}-workload-external-secret", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name=f"{self.workload.compound_name}-secrets", + namespace=ptd.POSIT_TEAM_NAMESPACE, + labels=self.required_tags, + ), + api_version=ESO_API_VERSION, + kind="ExternalSecret", + spec=_external_secret_spec(self.workload.compound_name, self.workload.secret_name), + opts=pulumi.ResourceOptions( + parent=self, + provider=self.kube_providers[release], + custom_timeouts=pulumi.CustomTimeouts(create="10m"), + ), + ) + + # Create ExternalSecret for each site + for site_name in sorted(self.workload.cfg.sites.keys()): + kubernetes.apiextensions.CustomResource( + f"{self.workload.compound_name}-{release}-{site_name}-external-secret", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name=f"{site_name}-secrets", + namespace=ptd.POSIT_TEAM_NAMESPACE, + labels=self.required_tags, + ), + api_version=ESO_API_VERSION, + kind="ExternalSecret", + spec=_external_secret_spec(site_name, self.workload.site_secret_name(site_name)), + opts=pulumi.ResourceOptions( + parent=self, + provider=self.kube_providers[release], + custom_timeouts=pulumi.CustomTimeouts(create="10m"), + ), + ) diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py b/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py index 15ed71b..5aa55df 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py @@ -34,11 +34,39 @@ def __init__( self.workload = workload self.traefik: k8s.helm.v3.Release | None = None + # Install Gateway API CRDs if enabled (before Traefik) + if self.release in self.workload.cfg.clusters and self.workload.cfg.clusters[self.release].enable_gateway_api: + self._define_gateway_api_crds() + self._define_namespace() self._define_helm_release() + # Create Gateway resources if enabled (after Traefik) + if self.release in self.workload.cfg.clusters and self.workload.cfg.clusters[self.release].enable_gateway_api: + self._define_gateway_resources() + self.register_outputs({}) + def _build_providers_config(self) -> dict: + """Build Traefik providers configuration conditionally based on enable_gateway_api flag.""" + providers_config = { + "kubernetesCRD": { + "enabled": True, + "allowCrossNamespace": True, + }, + "kubernetesIngress": { + "enabled": True, + }, + } + + # Add Gateway API provider if enabled + if self.release in self.workload.cfg.clusters and self.workload.cfg.clusters[self.release].enable_gateway_api: + providers_config["kubernetesGateway"] = { + "enabled": True, + } + + return providers_config + def _define_namespace(self): k8s.core.v1.Namespace( f"{self.workload.compound_name}-{self.release}-traefik-namespace", @@ -90,15 +118,7 @@ def _define_helm_release(self): "enabled": True, } }, - "providers": { - "kubernetesCRD": { - "enabled": True, - "allowCrossNamespace": True, - }, - "kubernetesIngress": { - "enabled": True, - }, - }, + "providers": self._build_providers_config(), "service": { "annotations": { "service.beta.kubernetes.io/azure-load-balancer-internal": "true", @@ -188,3 +208,100 @@ def _define_redirect_middleware(self): } }, } + + def _define_gateway_api_crds(self) -> None: + """Install Gateway API standard CRDs before Traefik deployment. 
+ + Installs Gateway API v1.2.1 standard CRDs (Gateway, GatewayClass, HTTPRoute, ReferenceGrant). + This must be installed before Traefik's Gateway API provider is enabled. + """ + k8s.yaml.ConfigFile( + f"{self.workload.compound_name}-{self.release}-gateway-api-crds", + file="https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.1/standard-install.yaml", + opts=pulumi.ResourceOptions(parent=self), + ) + + def _define_gateway_resources(self) -> None: + """Create Gateway API resources (GatewayClass, Gateway, ReferenceGrant). + + Creates the infrastructure Gateway resources that the team-operator will reference + via gatewayRef in Site CRs. + """ + # Create GatewayClass + k8s.apiextensions.CustomResource( + f"{self.workload.compound_name}-{self.release}-traefik-gateway-class", + api_version="gateway.networking.k8s.io/v1", + kind="GatewayClass", + metadata=k8s.meta.v1.ObjectMetaArgs( + name="traefik", + ), + spec={ + "controllerName": "traefik.io/gateway-controller", + }, + opts=pulumi.ResourceOptions(parent=self), + ) + + # Create Gateway in traefik namespace + k8s.apiextensions.CustomResource( + f"{self.workload.compound_name}-{self.release}-posit-team-gateway", + api_version="gateway.networking.k8s.io/v1", + kind="Gateway", + metadata=k8s.meta.v1.ObjectMetaArgs( + name="posit-team", + namespace=AZURE_TRAEFIK_NAMESPACE, + ), + spec={ + "gatewayClassName": "traefik", + "listeners": [ + { + "name": "https", + "protocol": "HTTPS", + "port": 443, + "allowedRoutes": { + "namespaces": { + "from": "All", + }, + }, + }, + { + "name": "http", + "protocol": "HTTP", + "port": 80, + "allowedRoutes": { + "namespaces": { + "from": "All", + }, + }, + }, + ], + }, + opts=pulumi.ResourceOptions(parent=self), + ) + + # Create ReferenceGrant to allow HTTPRoutes in posit-team namespace + # to reference Services in traefik namespace + k8s.apiextensions.CustomResource( + f"{self.workload.compound_name}-{self.release}-allow-posit-team-routes", + api_version="gateway.networking.k8s.io/v1beta1", + kind="ReferenceGrant", + metadata=k8s.meta.v1.ObjectMetaArgs( + name="allow-posit-team", + namespace=AZURE_TRAEFIK_NAMESPACE, + ), + spec={ + "from": [ + { + "group": "gateway.networking.k8s.io", + "kind": "HTTPRoute", + "namespace": "posit-team", + } + ], + "to": [ + { + "group": "", + "kind": "Service", + } + ], + }, + opts=pulumi.ResourceOptions(parent=self), + ) diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_workload_sites.py b/python-pulumi/src/ptd/pulumi_resources/azure_workload_sites.py index c4036ad..e536926 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_workload_sites.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_workload_sites.py @@ -70,26 +70,36 @@ def set_workload_fields(obj: dict[str, typing.Any], _: pulumi.ResourceOptions): if obj["kind"] != "Site": return + site_spec = { + # TODO: set chronicle and ppm storage buckets + "domain": self.workload.cfg.domain, + "networkTrust": self.workload.cfg.network_trust.value, + "packageManager": { + "azureFiles": { + "storageClassName": self.workload.azure_files_csi_storage_class_name, + "shareSizeGiB": self.workload.cfg.ppm_file_share_size_gib, + }, + }, + "secret": {"type": "kubernetes"}, + "secretType": "kubernetes", + "volumeSource": { + "type": "azure-netapp", + }, + } + + # Cloud-agnostic ingress (when Gateway API is enabled) + # Note: Azure workload sites don't have per-site cluster_cfg, so check all releases + for release in self.managed_clusters_by_release: + if release in self.workload.cfg.clusters and 
self.workload.cfg.clusters[release].enable_gateway_api: + site_spec["gatewayRef"] = { + "name": "posit-team", + "namespace": "traefik", + } + break # Only need to set once if any cluster has it enabled + obj["spec"] = deepmerge.always_merger.merge( obj.get("spec", {}), - copy.deepcopy( - { - # TODO: set chronicle and ppm storage buckets - "domain": self.workload.cfg.domain, - "networkTrust": self.workload.cfg.network_trust.value, - "packageManager": { - "azureFiles": { - "storageClassName": self.workload.azure_files_csi_storage_class_name, - "shareSizeGiB": self.workload.cfg.ppm_file_share_size_gib, - }, - }, - "secret": {"type": "kubernetes"}, - "secretType": "kubernetes", - "volumeSource": { - "type": "azure-netapp", - }, - } - ), + copy.deepcopy(site_spec), ) for release in self.managed_clusters_by_release: diff --git a/python-pulumi/tests/test_eso_and_external_secret_values.py b/python-pulumi/tests/test_eso_and_external_secret_values.py new file mode 100644 index 0000000..b553bae --- /dev/null +++ b/python-pulumi/tests/test_eso_and_external_secret_values.py @@ -0,0 +1,69 @@ +"""Tests for ESO Helm values and ExternalSecret/ClusterSecretStore CR structure.""" + +import yaml + +from ptd.pulumi_resources.aws_workload_helm import ( + _cluster_secret_store_spec, + _eso_helm_values, +) +from ptd.pulumi_resources.aws_workload_sites import _external_secret_spec as _build_external_secret_spec + + +def test_eso_helm_values_install_crds(): + values = _eso_helm_values() + assert values["installCRDs"] is True + + +def test_eso_helm_values_service_account(): + values = _eso_helm_values() + sa = values["serviceAccount"] + assert sa["create"] is True + assert sa["name"] == "external-secrets" + # No IRSA annotations — Pod Identity is used instead + assert "annotations" not in sa + + +def test_eso_helm_values_yaml_roundtrip(): + values = _eso_helm_values() + parsed = yaml.safe_load(yaml.dump(values)) + assert parsed["installCRDs"] is True + assert parsed["serviceAccount"]["name"] == "external-secrets" + assert "annotations" not in parsed["serviceAccount"] + + +def test_cluster_secret_store_no_auth_block(): + """ClusterSecretStore must have no auth block — credentials come from Pod Identity.""" + spec = _cluster_secret_store_spec("us-east-1") + aws_provider = spec["provider"]["aws"] + assert aws_provider["service"] == "SecretsManager" + assert aws_provider["region"] == "us-east-1" + assert "auth" not in aws_provider, "auth block must be absent; Pod Identity provides ambient credentials" + + +def test_cluster_secret_store_region_propagated(): + spec = _cluster_secret_store_spec("eu-west-1") + assert spec["provider"]["aws"]["region"] == "eu-west-1" + + +def test_external_secret_store_ref(): + spec = _build_external_secret_spec("mysite", "myworkload/mysite") + assert spec["secretStoreRef"]["name"] == "aws-secrets-manager" + assert spec["secretStoreRef"]["kind"] == "ClusterSecretStore" + + +def test_external_secret_refresh_interval(): + spec = _build_external_secret_spec("mysite", "myworkload/mysite") + assert spec["refreshInterval"] == "1h" + + +def test_external_secret_target_name(): + spec = _build_external_secret_spec("mysite", "myworkload/mysite") + assert spec["target"]["name"] == "mysite-secrets" + assert spec["target"]["creationPolicy"] == "Owner" + + +def test_external_secret_data_from_extract(): + secret_key = "myworkload/mysite" + spec = _build_external_secret_spec("mysite", secret_key) + assert len(spec["dataFrom"]) == 1 + assert spec["dataFrom"][0]["extract"]["key"] == secret_key diff --git 
a/python-pulumi/tests/test_workload_cluster_config.py b/python-pulumi/tests/test_workload_cluster_config.py index 2e3d7ba..b37a6ed 100644 --- a/python-pulumi/tests/test_workload_cluster_config.py +++ b/python-pulumi/tests/test_workload_cluster_config.py @@ -3,6 +3,7 @@ import pytest import ptd +import ptd.aws_workload def test_workload_cluster_config_default_initialization(): @@ -308,3 +309,59 @@ def test_workload_cluster_config_custom_k8s_resources_in_workload(): assert workload_config.clusters["20250328"].custom_k8s_resources == ["storage", "common"] assert workload_config.clusters["20250415"].custom_k8s_resources == ["monitoring"] + + +def test_packagemanager_roles_key_format(): + """Verify the '//' separator used as the packagemanager_roles dict key. + + _define_packagemanager_iam (population) and _define_pod_identity_associations (lookup) + must produce the same key. Both currently use: release + "//" + site_name. + This test uses the two expression forms so a change to either separator would fail here. + """ + release = "20250328" + site_name = "mysite" + # Form used by _define_packagemanager_iam + population_key = release + "//" + site_name + # Form used by _define_pod_identity_associations + lookup_key = f"{release}//{site_name}" + assert population_key == lookup_key + assert population_key == "20250328//mysite" + # Slashes in release or site_name would silently corrupt the separator. + assert "/" not in release + assert "/" not in site_name + + +def test_session_roles_key_format(): + """Verify the '-' separator used as the connect_session_roles and workbench_session_roles dict key. + + _define_connect_iam / _define_workbench_iam (population) and _define_pod_identity_associations + (lookup) must produce the same key. Both currently use: f"{release}-{site_name}". + This test uses the two expression forms so a change to either separator would fail here. + """ + release = "20250328" + site_name = "mysite" + # Form used by _define_connect_iam / _define_workbench_iam + population_key = release + "-" + site_name + # Form used by _define_pod_identity_associations + lookup_key = f"{release}-{site_name}" + assert population_key == lookup_key + assert population_key == "20250328-mysite" + + +def test_eso_requires_pod_identity(): + """enable_external_secrets_operator=True without enable_pod_identity_agent=True raises ValueError.""" + with pytest.raises(ValueError, match="enable_pod_identity_agent=True"): + ptd.aws_workload.AWSWorkloadClusterConfig( + enable_external_secrets_operator=True, + enable_pod_identity_agent=False, + ) + + +def test_eso_with_pod_identity_is_valid(): + """enable_external_secrets_operator=True with enable_pod_identity_agent=True is allowed.""" + cfg = ptd.aws_workload.AWSWorkloadClusterConfig( + enable_external_secrets_operator=True, + enable_pod_identity_agent=True, + ) + assert cfg.enable_external_secrets_operator is True + assert cfg.enable_pod_identity_agent is True
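
The pure-value helpers in aws_workload_helm.py admit the same kind of direct unit test as the ESO helpers above. A minimal sketch of two additional tests — not part of this diff — assuming only the helpers and constants introduced in aws_workload_helm.py and the layout of test_eso_and_external_secret_values.py:

```python
# Sketch only — exercises _nfs_subdir_provisioner_values and the ClusterSecretStore
# namespace condition; helper names are taken from aws_workload_helm.py above.
from ptd.pulumi_resources.aws_workload_helm import (
    NFS_STORAGE_CLASS_NAME,
    _cluster_secret_store_spec,
    _nfs_subdir_provisioner_values,
)


def test_nfs_values_wire_server_path_and_path_pattern():
    values = _nfs_subdir_provisioner_values("fs-123.fsx.us-east-1.amazonaws.com", "/fsx")
    assert values["nfs"]["server"] == "fs-123.fsx.us-east-1.amazonaws.com"
    assert values["nfs"]["path"] == "/fsx"
    sc = values["storageClass"]
    assert sc["name"] == NFS_STORAGE_CLASS_NAME
    # The annotation-driven subdirectory contract documented on enable_nfs_subdir_provisioner.
    assert sc["pathPattern"] == "${.PVC.annotations.nfs.io/storage-path}"


def test_cluster_secret_store_condition_scopes_to_posit_team():
    spec = _cluster_secret_store_spec("us-east-1")
    # conditions restrict which namespaces may reference this ClusterSecretStore.
    selector = spec["conditions"][0]["namespaceSelector"]["matchLabels"]
    assert selector == {"kubernetes.io/metadata.name": "posit-team"}
```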