From d1ad1d419c7b6e7e67b02ab6f8749d54f9a3399d Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 27 Mar 2026 15:50:51 +1100 Subject: [PATCH] Add selfhosted authorization documentation Add comprehensive authorization page covering: - Three authorization modes (Noop, Union built-in RBAC, External) - Helm configuration for each mode - External authorization server requirements (gRPC contract, actions, identity) - Service account permissions for internal platform apps - Observability metrics and alerts - Verification and troubleshooting guides Also removes Union-internal service names from selfhosted docs and converts architecture diagrams to Mermaid. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../helm-chart-reference/dataplane.md | 7 +- .../selfhosted-deployment/_index.md | 22 +- .../selfhosted-deployment/authentication.md | 56 +-- .../selfhosted-deployment/authorization.md | 405 ++++++++++++++++++ .../control-plane-aws.md | 67 +-- .../control-plane-gcp.md | 71 +-- .../selfhosted-deployment/data-plane-aws.md | 34 +- .../selfhosted-deployment/data-plane-gcp.md | 38 +- .../selfhosted-deployment/image-builder.md | 2 +- .../selfhosted-deployment/monitoring.md | 71 ++- 10 files changed, 593 insertions(+), 180 deletions(-) create mode 100644 content/deployment/selfhosted-deployment/authorization.md diff --git a/content/deployment/helm-chart-reference/dataplane.md b/content/deployment/helm-chart-reference/dataplane.md index 990132855..634df1c84 100644 --- a/content/deployment/helm-chart-reference/dataplane.md +++ b/content/deployment/helm-chart-reference/dataplane.md @@ -1,7 +1,7 @@ --- title: Dataplane chart variants: -flyte -byoc +selfmanaged -chart_version: 2026.3.10 +chart_version: 2026.3.11 weight: 1 --- @@ -11,8 +11,8 @@ Deploys the Union dataplane components to onboard a kubernetes cluster to the Un | | | |---|---| -| **Chart version** | 2026.3.10 | -| **App version** | 2026.3.7 | +| **Chart version** | 2026.3.11 | +| **App version** | 2026.3.8 | | 
**Kubernetes version** | `>= 1.28.0-0` | ## Dependencies @@ -38,6 +38,7 @@ Deploys the Union dataplane components to onboard a kubernetes cluster to the Un | additionalPodSpec | object | Define additional PodSpec values for all of the Union pods. | `{}` | | clusterName | string | Cluster name should be shared with Union for proper functionality. | `"{{ .Values.global.CLUSTER_NAME }}"` | | clusterresourcesync | object | clusterresourcesync contains the configuration information for the syncresources service. | `(see values.yaml)` | +| clusterresourcesync.additionalTemplates | list | Additional cluster resource templates to create per project namespace. Use this instead of overriding `templates` to avoid accidentally removing the default namespace, service account, and resource quota templates. Each entry has a `key` (filename stem) and `value` (Kubernetes manifest). | `[]` | | clusterresourcesync.additionalVolumeMounts | list | Appends additional volume mounts to the main container's spec. May include template values. | `[]` | | clusterresourcesync.additionalVolumes | list | Appends additional volumes to the deployment spec. May include template values. 
| `[]` | | clusterresourcesync.affinity | object | affinity configurations for the syncresources pods | `{}` | diff --git a/content/deployment/selfhosted-deployment/_index.md b/content/deployment/selfhosted-deployment/_index.md index 6bcfb1e9c..2cc3a6819 100644 --- a/content/deployment/selfhosted-deployment/_index.md +++ b/content/deployment/selfhosted-deployment/_index.md @@ -35,18 +35,18 @@ In a self-hosted intra-cluster deployment, the control plane and data plane comm ```mermaid graph TB subgraph cluster["Kubernetes Cluster"] - subgraph cp["Namespace: union-cp (Control Plane)"] + subgraph cp["Controlplane Namespace"] cpingress["NGINX Ingress\n(TLS/HTTP2)\nClusterIP"] - flyteadmin["Flyteadmin"] + admin["Admin"] identity["Identity"] - executions["Executions"] + services["Services"] - cpingress --> flyteadmin + cpingress --> admin cpingress --> identity - cpingress --> executions + cpingress --> services end - subgraph dp["Namespace: union (Data Plane)"] + subgraph dp["Dataplane Namespace"] dpingress["NGINX Ingress\nClusterIP"] operator["Operator"] propeller["Propeller"] @@ -65,10 +65,10 @@ graph TB dpingress -.->|"Internal DNS"| cpingress cpingress -.->|"Internal DNS"| dpingress - flyteadmin --> db + admin --> db identity --> db - executions --> db - flyteadmin --> storage + services --> db + admin --> storage operator --> storage end ``` @@ -126,6 +126,10 @@ Deploy the data plane with GCS and Workload Identity Configure OIDC/OAuth2 authentication for your deployment {{< /link-card >}} +{{< link-card target="./authorization" icon="shield" title="Authorization" >}} +Configure authorization mode (Noop, External, or Union built-in RBAC) +{{< /link-card >}} + {{< link-card target="./image-builder" icon="package" title="Image builder" >}} Register the image builder for automatic container image builds {{< /link-card >}} diff --git a/content/deployment/selfhosted-deployment/authentication.md b/content/deployment/selfhosted-deployment/authentication.md index 
d462f35ee..345c4668a 100644 --- a/content/deployment/selfhosted-deployment/authentication.md +++ b/content/deployment/selfhosted-deployment/authentication.md @@ -127,7 +127,7 @@ global: AUTH_TOKEN_URL: "https://your-idp.example.com/oauth2/default/v1/token" ``` -Enable authentication in FlyteAdmin: +Enable authentication in the admin service: ```yaml flyte: @@ -146,21 +146,23 @@ flyte: The control plane needs secrets for the browser login app (App 1) and the service-to-service app (App 3): ```shell -# Secret for flyteadmin (mounted at /etc/secrets/) +# Secret for admin service (mounted at /etc/secrets/) +# Note: "flyte-admin-secrets" is the default name expected by the Helm chart kubectl create secret generic flyte-admin-secrets \ --from-literal=client_secret='' \ - -n union-cp + -n -# Secret for flyte-scheduler (mounted at /etc/secrets/) +# Secret for scheduler (mounted at /etc/secrets/) +# Note: "flyte-secret-auth" is the default name expected by the Helm chart kubectl create secret generic flyte-secret-auth \ --from-literal=client_secret='' \ - -n union-cp + -n -# Add service-to-service client secret to the main secret -kubectl create secret generic union-controlplane-secrets \ +# Add service-to-service client secret to the controlplane secrets +kubectl create secret generic \ --from-literal=pass.txt='' \ --from-literal=client_secret='' \ - -n union-cp --dry-run=client -o yaml | kubectl apply -f - + -n --dry-run=client -o yaml | kubectl apply -f - ``` > [!NOTE] @@ -180,7 +182,7 @@ Create the data plane auth secret: ```shell kubectl create secret generic union-secret-auth \ --from-literal=client_secret='' \ - -n union + -n ``` ## Step 5: Configure EAGER_API_KEY @@ -199,7 +201,7 @@ Create the Kubernetes secret in the data plane namespace: ```shell kubectl create secret generic \ --from-literal=='' \ - -n union + -n ``` > [!NOTE] @@ -212,7 +214,7 @@ Deploy or upgrade both the control plane and data plane with the updated configu ```shell # Upgrade control plane helm 
upgrade unionai-controlplane unionai/controlplane \ - --namespace union-cp \ + --namespace \ -f values..selfhosted-intracluster.yaml \ -f values.registry.yaml \ -f values..selfhosted-overrides.yaml \ @@ -220,7 +222,7 @@ helm upgrade unionai-controlplane unionai/controlplane \ # Upgrade data plane helm upgrade unionai-dataplane unionai/dataplane \ - --namespace union \ + --namespace \ -f values..selfhosted-intracluster.yaml \ -f values..selfhosted-overrides.yaml \ --timeout 10m --wait @@ -229,31 +231,31 @@ helm upgrade unionai-dataplane unionai/dataplane \ ## Verification ```shell -# Check flyteadmin logs for auth initialization -kubectl logs -n union-cp deploy/flyteadmin | grep -i auth +# Check admin service logs for auth initialization +kubectl logs -n deploy/ | grep -i auth # Test the /me endpoint (should return 401 without a token) -kubectl exec -n union-cp deploy/flyteadmin -- \ +kubectl exec -n deploy/ -- \ curl -s -o /dev/null -w "%{http_code}" \ - https://controlplane-nginx-controller.union-cp.svc.cluster.local/me -k + https://..svc.cluster.local/me -k # Test CLI login uctl config init --host https:// uctl get project # Check data plane operator auth -kubectl logs -n union -l app.kubernetes.io/name=operator --tail=50 | grep -i "token\|auth" +kubectl logs -n -l app.kubernetes.io/name=operator --tail=50 | grep -i "token\|auth" ``` ## Summary of secrets | Secret name | Namespace | Keys | Source | |-------------|-----------|------|--------| -| `flyte-admin-secrets` | `union-cp` | `client_secret` | Browser login app (App 1) secret | -| `flyte-secret-auth` | `union-cp` | `client_secret` | Browser login app (App 1) secret | -| `union-controlplane-secrets` | `union-cp` | `pass.txt`, `client_secret` | DB password, Service-to-service app (App 3) secret | -| `union-secret-auth` | `union` | `client_secret` | Operator app (App 4) secret | -| EAGER secret | `union` | varies | EAGER app (App 5) encoded key | +| `flyte-admin-secrets` (Helm chart default) | `` | 
`client_secret` | Browser login app (App 1) secret | +| `flyte-secret-auth` (Helm chart default) | `` | `client_secret` | Browser login app (App 1) secret | +| `` | `` | `pass.txt`, `client_secret` | DB password, Service-to-service app (App 3) secret | +| `union-secret-auth` | `` | `client_secret` | Operator app (App 4) secret | +| EAGER secret | `` | varies | EAGER app (App 5) encoded key | ## Self-hosted vs. self-managed authentication @@ -266,26 +268,26 @@ kubectl logs -n union -l app.kubernetes.io/name=operator --tail=50 | grep -i "to ## Troubleshooting -### FlyteAdmin auth endpoints return 404 +### Admin service auth endpoints return 404 Ensure `useAuth: true` is set under `flyte.configmap.adminServer.server.security`. Without this, the `/login`, `/callback`, and `/me` endpoints are not registered. ### Token validation fails with "audience mismatch" -The `allowedAudience` in the FlyteAdmin configuration must include `https://`. This should match the audience configured on your authorization server. +The `allowedAudience` in the admin service configuration must include `https://`. This should match the audience configured on your authorization server. 
### Data plane cannot authenticate to control plane ```shell # Verify AUTH_CLIENT_ID is set -kubectl get configmap -n union -o yaml | grep -i auth_client +kubectl get configmap -n -o yaml | grep -i auth_client # Check that union-secret-auth exists -kubectl get secret union-secret-auth -n union \ +kubectl get secret union-secret-auth -n \ -o jsonpath='{.data.client_secret}' | base64 -d # Check operator logs -kubectl logs -n union -l app.kubernetes.io/name=operator --tail=50 \ +kubectl logs -n -l app.kubernetes.io/name=operator --tail=50 \ | grep -i "auth\|token\|401" ``` diff --git a/content/deployment/selfhosted-deployment/authorization.md b/content/deployment/selfhosted-deployment/authorization.md new file mode 100644 index 000000000..940083e5e --- /dev/null +++ b/content/deployment/selfhosted-deployment/authorization.md @@ -0,0 +1,405 @@ +--- +title: Authorization +weight: 6 +variants: -flyte -serverless -byoc +selfmanaged +mermaid: true +--- +# Authorization + +{{< key product_name >}} self-hosted deployments support configurable authorization backends to control who can perform which actions on platform resources. The authorization mode determines how access control decisions are made for API requests from the console, CLI, and SDK. + +Unlike other deployment models where {{< key product_name >}} manages RBAC for you, **self-hosted deployments let you choose the authorization model** that fits your organization's security requirements. + +## Prerequisites + +Authorization builds on top of [authentication]({{< relref "authentication" >}}). Before configuring authorization, ensure: + +1. **Authentication is configured and working** — all five OAuth2 applications are created and the control plane is accepting authenticated requests. +2. 
**Custom claims are configured on your authorization server:** + +| Claim | Values | Required for | Used for | +|-------|--------|-------------|----------| +| `sub` | User's internal ID or app's client ID | All modes | Primary identity for authorization decisions | +| `identitytype` | `"user"` or `"app"` | Union mode | Distinguishes human users from service accounts. Not strictly required for External mode — your external server can determine identity type from the `sub` claim or JWT payload directly. | +| `preferred_username` | User login or app client ID | All modes | Identity injection ("Owned By" display in the console) | + +3. **You understand which OAuth apps generate which identity types:** + +| OAuth App | # | Token `sub` claim | Identity type | Purpose in authorization | +|-----------|---|-------------------|---------------|--------------------------| +| Browser login | 1 | User's internal ID | `user` | End-user console/UI actions | +| CLI | 2 | User's internal ID (interactive) or app's client ID (service credentials) | `user` or `app` | End-user or automated CLI actions | +| Service-to-service | 3 | App's client ID | `app` | Internal platform calls | +| Operator | 4 | App's client ID | `app` | Dataplane → controlplane operations | +| EAGER | 5 | App's client ID | `app` | Task pod operations on behalf of users | + +> [!NOTE] +> Apps 3–5 are internal platform service accounts. Your external authorization server must grant them appropriate permissions for the platform to function. See [Service account permissions](#service-account-permissions) below. 
+ +## Architecture + +All controlplane services route authorization decisions through a centralized authorization component that delegates to the configured backend: + +```mermaid +graph LR + subgraph cp["Controlplane"] + A["Service A"] --> Auth["Authorize()"] + B["Service B"] --> Auth + C["Service C"] --> Auth + D["Service ..."] --> Auth + Auth --> Backend["Backend\n(Noop / Union / External)"] + end +``` + +Each controlplane service forwards `Authorize()` calls and the configured backend returns allow/deny decisions. + +## Authorization modes + +{{< key product_name >}} supports three authorization modes: + +| Mode | Backend | Best for | Enforcement | Configuration | +|------|---------|----------|-------------|---------------| +| **Noop** | None | Development, small teams | All requests allowed | Default, no config needed | +| **Union** | {{< key product_name >}} RBAC | Production deployments | {{< key product_name >}}-managed policies | Built-in, enable via config | +| **External** | Customer-provided gRPC server | Organizations with existing RBAC systems | Customer-defined policies | Requires external server | + +### Noop (default) + +No authorization enforcement — all authenticated requests are allowed. This is the default mode. + +**When to use:** +- Development and testing environments +- Small teams where all users are trusted +- Initial deployment before configuring authorization +- Environments where authentication alone provides sufficient access control + +**Trade-offs:** +- No access control beyond authentication +- Any authenticated user can perform any action on any resource +- No audit trail of authorization decisions + +### Union (built-in RBAC) — recommended + +{{< key product_name >}}'s built-in authorization engine, **embedded in the controlplane Helm chart**. It deploys automatically when enabled, with no separate chart installation required. 
Provides role-based access control with predefined roles (Admin, Contributor, Viewer) and policy-based fine-grained permissions.
+
+> [!NOTE]
+> The Helm config value `type: "UserClouds"` is a legacy name from an earlier implementation. It activates {{< key product_name >}}'s built-in authorization engine. This will be renamed to `type: "Union"` in a future release.
+
+**When to use:**
+- Production deployments wanting out-of-the-box RBAC with no additional infrastructure
+- Organizations that need role management through the {{< key product_name >}} console
+- Teams wanting a performant, battle-tested authorization backend with low operational burden
+
+**Key benefits:**
+- Built-in role management (Admin, Contributor, Viewer) with full RBAC — assign users and groups to roles with resource-level granularity
+- Zero additional infrastructure — embedded in the controlplane chart, managed by {{< key product_name >}}
+- Uses the controlplane database for policy storage — no separate database required
+- This is the same authorization engine used by {{< key product_name >}}'s managed deployments
+
+### External
+
+Delegates authorization decisions to a customer-provided gRPC server. The external server receives the caller's identity, the requested action, and the target resource, and returns an allow/deny decision.
+
+> [!WARNING]
+> The external authorization server is called on **every API request**. Its latency directly impacts platform response times. Ensure your server can handle the request volume with low latency (<10ms p99 recommended).
+
+**When to use:**
+- Organizations with existing RBAC/policy engines (e.g.
OPA, Cedar, custom systems) where a sync with {{< key product_name >}}'s native authorization is undesirable or not possible +- Enterprises requiring authorization integration with internal identity management +- Deployments needing custom authorization logic beyond role-based access + +**Trade-offs:** +- Full control over authorization policies and logic +- Requires building, deploying, and operating an external authorization server +- The external server is on the critical path — its reliability and performance directly impact platform availability +- Higher operational burden than Union mode — you own the server's uptime, scaling, and policy management +- {{< key product_name >}} owns the authorization routing layer; the customer owns the external backend + +> [!NOTE] +> A **fail-open** option (`failOpen: true`) allows requests when the external server is unreachable. This trades security for availability — use with caution in production. + +## Configuration + +Authorization mode is configured in the controlplane Helm values under `services.authorizer.configMap.authorizer`. + +### Noop + +No configuration required — this is the default: + +```yaml +services: + authorizer: + configMap: + authorizer: + type: "Noop" +``` + +### Union (built-in RBAC) + +Enable {{< key product_name >}}'s built-in RBAC: + +```yaml +services: + authorizer: + configMap: + authorizer: + type: "UserClouds" # Legacy name — activates Union's built-in RBAC +``` + +The `userCloudsClient` defaults are pre-configured in the chart. In most cases you only need to change the `type` field. + +### External + +Configure the authorization backend to proxy to your external gRPC server: + +```yaml +services: + authorizer: + configMap: + authorizer: + type: "External" + externalClient: + grpcConfig: + # gRPC target for your external authorization server. + # Uses standard gRPC name resolution (dns:///, unix:///, etc). 
+ host: "dns:///your-authz-server.namespace.svc.cluster.local:50051" + + # Connect without TLS (plaintext). Set to false for TLS connections. + insecure: true + + # Skip server certificate verification (TLS only). Do NOT use in production. + # insecureSkipVerify: false + + # Timeout per gRPC call (default: "5s"). + # perRetryTimeout: "5s" + + # Max retries. 0 = fail fast, no retries (default: 0). + # maxRetries: 0 + + # gRPC metadata keys forwarded to the external server. + # Default: ["authorization", "flyte-authorization"] + forwardHeaders: + - authorization + - flyte-authorization + + # Allow requests when the external server is unreachable. + # If false (default), deny on error. + failOpen: false +``` + +## External authorization server requirements + +This section applies only to **External** mode. Your authorization server must meet the following requirements. + +### gRPC contract + +Your server must implement the `AuthorizerService.Authorize` unary RPC: + +```protobuf +service AuthorizerService { + rpc Authorize(AuthorizeRequest) returns (AuthorizeResponse); +} +``` + +**Request fields:** + +| Field | Type | Description | +|-------|------|-------------| +| `identity` | `Identity` | The caller — one of `user_id` (subject), `application_id` (subject), or `external_identity` (subject + raw OIDC token) | +| `action` | `Action` enum | The operation being requested | +| `resource` | `Resource` | The target resource (organization, domain, project, or cluster) | +| `organization` | `string` | The organization identifier | + +**Response:** + +| Field | Type | Description | +|-------|------|-------------| +| `allowed` | `bool` | `true` to allow the request, `false` to deny | + +### Identity resolution + +The caller's identity is resolved and forwarded to your server through two channels: + +1. 
**`AuthorizeRequest.identity` protobuf field** (recommended) — a structured identity containing `user_id.subject`, `application_id.subject`, or `external_identity.subject` + `external_identity.token`. This is the primary source of identity and is always populated when identity can be resolved. + +2. **gRPC metadata headers** (`authorization` / `flyte-authorization`) — forwarded to your server if configured in `forwardHeaders`. Contains the raw JWT/OIDC token. Your server can decode the JWT payload to read claims (`sub`, `identitytype`, `email`, `groups`, etc.) without signature verification — the token has already been validated upstream. + +> [!NOTE] +> For browser and CLI requests, identity is resolved from the authentication layer's headers (`X-User-Subject`, `X-User-Claim-Identitytype`). For service-to-service requests, identity is resolved from the JWT token in gRPC metadata. + +### Actions + +Your server must handle the following authorization actions: + +| Action | Description | Typical callers | +|--------|-------------|-----------------| +| `ACTION_VIEW_FLYTE_INVENTORY` | View workflows, tasks, launch plans | All users and services | +| `ACTION_VIEW_FLYTE_EXECUTIONS` | View executions and run details | All users and services | +| `ACTION_REGISTER_FLYTE_INVENTORY` | Register workflows, tasks, launch plans | Contributors, operators, EAGER | +| `ACTION_CREATE_FLYTE_EXECUTIONS` | Launch executions | Contributors, operators, EAGER | +| `ACTION_ADMINISTER_PROJECT` | Manage project settings | Admins | +| `ACTION_MANAGE_PERMISSIONS` | Manage user roles and policies | Admins | +| `ACTION_ADMINISTER_ACCOUNT` | Account-level administration | Admins | +| `ACTION_MANAGE_CLUSTER` | Cluster lifecycle operations | Operators (App 4) | +| `ACTION_EDIT_EXECUTION_RELATED_ATTRIBUTES` | Modify execution attributes | Contributors, operators | +| `ACTION_EDIT_CLUSTER_RELATED_ATTRIBUTES` | Modify cluster attributes | Operators | +| `ACTION_EDIT_UNUSED_ATTRIBUTES` | Modify other 
attributes | Contributors | +| `ACTION_SUPPORT_SYSTEM_LOGS` | Access system logs | Admins | +| `ACTION_VIEW_IDENTITIES` | View user/app identities | Admins | + +### Service account permissions + +Your external authorization server must grant appropriate permissions to the internal platform service accounts (OAuth Apps 3–5 from [Authentication]({{< relref "authentication" >}})). Without these, internal platform operations will fail. + +| OAuth App | # | Subject (`sub` claim) | Required permissions | +|-----------|---|----------------------|----------------------| +| Service-to-service | 3 | `INTERNAL_CLIENT_ID` value | All actions listed above (this is the internal platform identity) | +| Operator | 4 | `AUTH_CLIENT_ID` value | `MANAGE_CLUSTER`, `VIEW_FLYTE_INVENTORY`, `VIEW_FLYTE_EXECUTIONS`, `CREATE_FLYTE_EXECUTIONS` | +| EAGER | 5 | EAGER app client ID | `VIEW_FLYTE_INVENTORY`, `VIEW_FLYTE_EXECUTIONS`, `REGISTER_FLYTE_INVENTORY`, `CREATE_FLYTE_EXECUTIONS`, `EDIT_EXECUTION_RELATED_ATTRIBUTES`, `EDIT_CLUSTER_RELATED_ATTRIBUTES` | + +> [!WARNING] +> If the operator service account (App 4) is not granted `MANAGE_CLUSTER`, the dataplane will be unable to register with the controlplane or send heartbeats. If the EAGER service account (App 5) is not granted execution permissions, task pods will fail to launch child tasks or register workflow artifacts. + +**Example:** If your external server uses a static subject-to-role mapping, the configuration might look like: + +```yaml +subjects: + # Human users + "user@example.com": Admin + + # Internal platform service accounts — use the OAuth app client IDs + # from your identity provider. These are the same client IDs configured + # in the authentication step. 
+ "": PlatformAdmin # App 3: internal platform identity + "": ClusterOperator # App 4: dataplane operator + "": RuntimeService # App 5: task execution +``` + +Your implementation may use different role names or a different permission model entirely — the requirement is that these subjects are granted the listed actions. + +### Reference implementation + +A reference implementation is available as an example for testing and development. Contact {{< key product_name >}} support for access. It demonstrates: + +- Extracting identity from the `AuthorizeRequest.identity` protobuf field +- Falling back to forwarded JWT metadata headers +- Static subject → role → action permission mapping +- Logging identity resolution and authorization decisions + +> [!NOTE] +> The reference implementation is intended for testing and development. Production implementations should integrate with your organization's identity and policy management systems. + +## Observability + +The controlplane exposes Prometheus metrics for monitoring authorization decisions and backend health. These are included in the controlplane Grafana dashboard under the **Authorizer** row. 
+ +### Key metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `authz_allowed{action}` | Counter | Allowed decisions by action type | +| `authz_denied{action}` | Counter | Denied decisions by action type | +| `authorize_duration` | Histogram | End-to-end Authorize() latency | +| `authorize_errors_total{error_source}` | Counter | Errors by source (backend, identity_resolution) | +| `authz_type_info{type}` | Gauge | Active authorization mode | +| `external:errors{grpc_code}` | Counter | External backend errors by gRPC status code | +| `external:authorize_duration` | Histogram | External backend call latency | +| `external:fail_open_activated` | Counter | Fail-open bypass events | +| `external:connection_state` | Gauge | gRPC connection state to external backend | + +### Alerts + +When [alerting is enabled]({{< relref "monitoring#alerting" >}}), the following authorization-specific alerts are available: + +| Alert | Severity | Condition | +|-------|----------|-----------| +| `UnionCPAuthorizerExternalErrors` | Warning | External backend errors >0.1/s for 5 minutes | +| `UnionCPAuthorizerFailOpenActive` | Critical | Fail-open is actively bypassing authorization | +| `UnionCPAuthorizerHighDenyRate` | Warning | Authorization deny rate exceeds 50% for 10 minutes | + +## Verification + +After configuring authorization, verify it's working: + +1. **Check the authorization component is running:** + +```shell +kubectl get pods -n -l app.kubernetes.io/name=authorizer +``` + +2. **Verify the authorization mode in logs:** + +```shell +kubectl logs -n deployment/authorizer | grep "Authz client config" +# Expected: Authz client config: type=External (or Noop, UserClouds) +``` + +3. **For External mode, verify connectivity:** + +```shell +kubectl logs -n deployment/authorizer | grep "external authorization" +# Expected: Initializing an external authorization proxy service with endpoint ... +``` + +4. 
**Verify from the console:** Navigate to the {{< key product_name >}} console and confirm you can view projects and runs without errors. + +5. **Verify from the CLI:** Trigger a workflow execution to confirm the non-browser flow works: + +```shell +uctl get project +uctl create execution --project --domain development --launch-plan +``` + +6. **For External mode, verify service account access:** Monitor the external server logs for requests from the internal platform service accounts (Apps 3, 4, 5). Ensure all are receiving `ALLOWED` decisions. + +## Troubleshooting + +### All requests denied + +- **Check service account mappings** — the most common cause is that the internal platform service accounts (Apps 3, 4, 5) are not granted permissions in the external server. Check the external server logs for `DENIED` decisions with service account subjects. +- Check that the external authorization server is running and reachable +- Verify the `grpcConfig.host` endpoint is correct (use `dns:///` prefix for DNS-based resolution) +- Temporarily set `failOpen: true` to confirm the issue is with the external backend + +### Dataplane cannot register or heartbeat + +The operator (App 4) needs `ACTION_MANAGE_CLUSTER` permission. Check: + +```shell +kubectl logs -n deployment/authorizer | grep "MANAGE_CLUSTER" +``` + +If you see denied decisions for the operator's client ID, add it to your external server's permission configuration. + +### Workflows fail to launch child tasks + +The EAGER service account (App 5) needs `ACTION_CREATE_FLYTE_EXECUTIONS` and `ACTION_REGISTER_FLYTE_INVENTORY`. Check: + +```shell +kubectl logs -n deployment/authorizer | grep "" +``` + +### "Owned By: Unknown" in the console + +The `preferred_username` claim is not configured in your identity provider. See [Authentication — Authorization server setup]({{< relref "authentication#authorization-server-setup" >}}). 
+ +### Authorization component crashlooping + +- Check logs: `kubectl logs -n deployment/authorizer` +- Verify the `type` field is a valid value (`Noop`, `External`, or `UserClouds` for Union RBAC) +- Ensure the `externalClient.grpcConfig.host` is set when using `External` mode + +### High latency on API calls + +- Check `external:authorize_duration` metrics in the Grafana dashboard +- The authorization backend is on the critical path — external backend latency directly impacts API response times +- Consider reducing `perRetryTimeout` or setting `maxRetries: 0` for fail-fast behavior + +### Connection errors to external backend + +- Check `external:errors{grpc_code}` metrics for the failure mode: + - `Unavailable`: Network connectivity issue — verify the service endpoint and port + - `DeadlineExceeded`: Timeout — the external server is too slow to respond + - `Internal`/`Unknown`: Application error in the external server +- Use `insecure: true` for plaintext connections within the cluster +- Use `insecureSkipVerify: true` only for testing with self-signed certificates diff --git a/content/deployment/selfhosted-deployment/control-plane-aws.md b/content/deployment/selfhosted-deployment/control-plane-aws.md index 0540baddd..b856c6944 100644 --- a/content/deployment/selfhosted-deployment/control-plane-aws.md +++ b/content/deployment/selfhosted-deployment/control-plane-aws.md @@ -37,16 +37,16 @@ helm repo update ### Step 2: Create registry image pull secret -Create the registry secret in the `union-cp` namespace: +Create the registry secret in the control plane namespace: ```shell -kubectl create namespace union-cp +kubectl create namespace kubectl create secret docker-registry union-registry-secret \ --docker-server="registry.unionai.cloud" \ --docker-username="" \ --docker-password="" \ - -n union-cp + -n ``` > [!NOTE] @@ -65,12 +65,12 @@ gRPC requires TLS for HTTP/2 with NGINX. 
You can use self-signed certificates fo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ -keyout controlplane-tls.key \ -out controlplane-tls.crt \ - -subj "/CN=controlplane-nginx-controller.union-cp.svc.cluster.local" + -subj "/CN=..svc.cluster.local" kubectl create secret tls controlplane-tls-cert \ --key controlplane-tls.key \ --cert controlplane-tls.crt \ - -n union-cp + -n ``` {{< /tab >}} @@ -84,14 +84,14 @@ For production deployments, use cert-manager with a self-signed `ClusterIssuer` ### Step 4: Create database password secret ```shell -kubectl create secret generic union-controlplane-secrets \ +kubectl create secret generic \ --from-literal=pass.txt='' \ - -n union-cp + -n ``` > [!NOTE] -> The secret name `union-controlplane-secrets` is required and should not be changed. > The secret must contain a key named `pass.txt` with the database password. +> The default secret name is set in your Helm values. ### Step 5: Download values files @@ -122,7 +122,7 @@ To enable authentication, add the OIDC configuration to this file. See the [Auth ```shell helm upgrade --install unionai-controlplane unionai/controlplane \ - --namespace union-cp \ + --namespace \ --create-namespace \ -f values.aws.selfhosted-intracluster.yaml \ -f values.registry.yaml \ @@ -141,21 +141,24 @@ helm upgrade --install unionai-controlplane unionai/controlplane \ ```shell # Check pod status -kubectl get pods -n union-cp +kubectl get pods -n # Verify services are running -kubectl get svc -n union-cp +kubectl get svc -n -# Check flyteadmin logs -kubectl logs -n union-cp deploy/flyteadmin --tail=50 +# Check admin service logs +kubectl logs -n deploy/ --tail=50 # Test internal connectivity -kubectl exec -n union-cp deploy/flyteadmin -- \ - curl -k https://controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl exec -n deploy/ -- \ + curl -k https://..svc.cluster.local ``` All pods should be in `Running` state and internal connectivity should succeed. 
+> [!NOTE]
+> Replace `<namespace>` with your Helm release namespace (the namespace you used during `helm install`). Replace `<admin-deployment>` and `<nginx-deployment>` with the actual deployment names from `kubectl get deploy -n <namespace>`.
+
 ## Key configuration
 
 ### Single-tenant mode
@@ -173,22 +176,22 @@ Configure the namespace and name of the Kubernetes TLS secret:
 
 ```yaml
 global:
-  TLS_SECRET_NAMESPACE: "union-cp"
+  TLS_SECRET_NAMESPACE: "<namespace>"
   TLS_SECRET_NAME: "controlplane-tls-cert"
 
 ingress-nginx:
   controller:
     extraArgs:
-      default-ssl-certificate: "union-cp/controlplane-tls-cert"
+      default-ssl-certificate: "<namespace>/controlplane-tls-cert"
 ```
 
 ### Service discovery
 
 Control plane services discover each other via Kubernetes DNS:
 
-- **Flyteadmin**: `flyteadmin.union-cp.svc.cluster.local:81`
-- **NGINX Ingress**: `controlplane-nginx-controller.union-cp.svc.cluster.local`
-- **Data plane** (for dataproxy): `dataplane-nginx-controller.union.svc.cluster.local`
+- **Admin service**: `<admin-service>.<namespace>.svc.cluster.local:81`
+- **NGINX Ingress**: `<nginx-service>.<namespace>.svc.cluster.local`
+- **Data plane** (for dataproxy): `<dataplane-nginx-service>.<dataplane-namespace>.svc.cluster.local`
 
 ## Next steps
 
@@ -200,29 +203,29 @@ Control plane services discover each other via Kubernetes DNS:
 
 ### Control plane pods not starting
 
 ```shell
-kubectl describe pod -n union-cp
+kubectl describe pod -n <namespace>
 kubectl top nodes
-kubectl get secret -n union-cp
+kubectl get secret -n <namespace>
 ```
 
 ### TLS/Certificate errors
 
 ```shell
-kubectl get secret controlplane-tls-cert -n union-cp
-kubectl get secret controlplane-tls-cert -n union-cp \
+kubectl get secret controlplane-tls-cert -n <namespace>
+kubectl get secret controlplane-tls-cert -n <namespace> \
   -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -text -noout
-kubectl logs -n union-cp deploy/controlplane-nginx-controller
+kubectl logs -n <namespace> deploy/<nginx-deployment>
 ```
 
 ### Database connection failures
 
 ```shell
 # Verify credentials
-kubectl get secret union-controlplane-secrets -n union-cp \
+kubectl get secret <secret-name> -n <namespace> \
   -o jsonpath='{.data.pass\.txt}' | base64 -d
 
 # Test connectivity
-kubectl run -n union-cp test-db 
--image=postgres:14 --rm -it -- \ +kubectl run -n test-db --image=postgres:14 --rm -it -- \ psql -h -U -d ``` @@ -230,13 +233,13 @@ kubectl run -n union-cp test-db --image=postgres:14 --rm -it -- \ ```shell # Verify service endpoints -kubectl get svc -n union-cp | grep -E 'flyteadmin|nginx-controller' +kubectl get svc -n | grep -E 'admin\|nginx-controller' # Test DNS resolution from data plane namespace -kubectl run -n union test-dns --image=busybox --rm -it -- \ - nslookup controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl run -n test-dns --image=busybox --rm -it -- \ + nslookup ..svc.cluster.local # Check network policies -kubectl get networkpolicies -n union-cp -kubectl get networkpolicies -n union +kubectl get networkpolicies -n +kubectl get networkpolicies -n ``` diff --git a/content/deployment/selfhosted-deployment/control-plane-gcp.md b/content/deployment/selfhosted-deployment/control-plane-gcp.md index 1566d2b27..1e8af250d 100644 --- a/content/deployment/selfhosted-deployment/control-plane-gcp.md +++ b/content/deployment/selfhosted-deployment/control-plane-gcp.md @@ -42,16 +42,16 @@ helm repo update ### Step 2: Create registry image pull secret -Create the registry secret in the `union-cp` namespace: +Create the registry secret in the control plane namespace: ```shell -kubectl create namespace union-cp +kubectl create namespace kubectl create secret docker-registry union-registry-secret \ --docker-server="registry.unionai.cloud" \ --docker-username="" \ --docker-password="" \ - -n union-cp + -n ``` > [!NOTE] @@ -70,12 +70,12 @@ gRPC requires TLS for HTTP/2 with NGINX. 
You can use self-signed certificates fo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ -keyout controlplane-tls.key \ -out controlplane-tls.crt \ - -subj "/CN=controlplane-nginx-controller.union-cp.svc.cluster.local" + -subj "/CN=..svc.cluster.local" kubectl create secret tls controlplane-tls-cert \ --key controlplane-tls.key \ --cert controlplane-tls.crt \ - -n union-cp + -n ``` {{< /tab >}} @@ -89,14 +89,14 @@ For production deployments, use cert-manager with a self-signed `ClusterIssuer` ### Step 4: Create database password secret ```shell -kubectl create secret generic union-controlplane-secrets \ +kubectl create secret generic \ --from-literal=pass.txt='' \ - -n union-cp + -n ``` > [!NOTE] -> The secret name `union-controlplane-secrets` is required and should not be changed. > The secret must contain a key named `pass.txt` with the database password. +> The default secret name is set in your Helm values. ### Step 5: Download values files @@ -128,7 +128,7 @@ To enable authentication, add the OIDC configuration to this file. See the [Auth ```shell helm upgrade --install unionai-controlplane unionai/controlplane \ - --namespace union-cp \ + --namespace \ --create-namespace \ -f values.gcp.selfhosted-intracluster.yaml \ -f values.registry.yaml \ @@ -147,21 +147,24 @@ helm upgrade --install unionai-controlplane unionai/controlplane \ ```shell # Check pod status -kubectl get pods -n union-cp +kubectl get pods -n # Verify services are running -kubectl get svc -n union-cp +kubectl get svc -n -# Check flyteadmin logs -kubectl logs -n union-cp deploy/flyteadmin --tail=50 +# Check admin service logs +kubectl logs -n deploy/ --tail=50 # Test internal connectivity -kubectl exec -n union-cp deploy/flyteadmin -- \ - curl -k https://controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl exec -n deploy/ -- \ + curl -k https://..svc.cluster.local ``` All pods should be in `Running` state and internal connectivity should succeed. 
+> [!NOTE]
+> Replace `<namespace>` with your Helm release namespace (the namespace you used during `helm install`). Replace `<admin-deployment>` and `<nginx-deployment>` with the actual deployment names from `kubectl get deploy -n <namespace>`.
+
 ## Key configuration
 
 ### Single-tenant mode
@@ -179,22 +182,22 @@ Configure the namespace and name of the Kubernetes TLS secret:
 
 ```yaml
 global:
-  TLS_SECRET_NAMESPACE: "union-cp"
+  TLS_SECRET_NAMESPACE: "<namespace>"
   TLS_SECRET_NAME: "controlplane-tls-cert"
 
 ingress-nginx:
   controller:
     extraArgs:
-      default-ssl-certificate: "union-cp/controlplane-tls-cert"
+      default-ssl-certificate: "<namespace>/controlplane-tls-cert"
 ```
 
 ### Service discovery
 
 Control plane services discover each other via Kubernetes DNS:
 
-- **Flyteadmin**: `flyteadmin.union-cp.svc.cluster.local:81`
-- **NGINX Ingress**: `controlplane-nginx-controller.union-cp.svc.cluster.local`
-- **Data plane** (for dataproxy): `dataplane-nginx-controller.union.svc.cluster.local`
+- **Admin service**: `<admin-service>.<namespace>.svc.cluster.local:81`
+- **NGINX Ingress**: `<nginx-service>.<namespace>.svc.cluster.local`
+- **Data plane** (for dataproxy): `<dataplane-nginx-service>.<dataplane-namespace>.svc.cluster.local`
 
 ## Next steps
 
@@ -206,29 +209,29 @@ Control plane services discover each other via Kubernetes DNS:
 
 ### Control plane pods not starting
 
 ```shell
-kubectl describe pod -n union-cp
+kubectl describe pod -n <namespace>
 kubectl top nodes
-kubectl get secret -n union-cp
+kubectl get secret -n <namespace>
 ```
 
 ### TLS/Certificate errors
 
 ```shell
-kubectl get secret controlplane-tls-cert -n union-cp
-kubectl get secret controlplane-tls-cert -n union-cp \
+kubectl get secret controlplane-tls-cert -n <namespace>
+kubectl get secret controlplane-tls-cert -n <namespace> \
  -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -text -noout
-kubectl logs -n union-cp deploy/controlplane-nginx-controller
+kubectl logs -n <namespace> deploy/<nginx-deployment>
 ```
 
 ### Database connection failures
 
 ```shell
 # Verify credentials
-kubectl get secret union-controlplane-secrets -n union-cp \
+kubectl get secret <secret-name> -n <namespace> \
   -o jsonpath='{.data.pass\.txt}' | base64 -d
 
 # Test connectivity
-kubectl run -n union-cp test-db 
--image=postgres:14 --rm -it -- \ +kubectl run -n test-db --image=postgres:14 --rm -it -- \ psql -h -U -d ``` @@ -236,13 +239,13 @@ kubectl run -n union-cp test-db --image=postgres:14 --rm -it -- \ ```shell # Verify service account annotations -kubectl get sa -n union-cp -o yaml | grep iam.gke.io/gcp-service-account +kubectl get sa -n -o yaml | grep iam.gke.io/gcp-service-account # Check IAM bindings gcloud iam service-accounts get-iam-policy # Verify pod can authenticate -kubectl exec -n union-cp deploy/flyteadmin -- \ +kubectl exec -n deploy/ -- \ curl -H "Metadata-Flavor: Google" \ http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/email ``` @@ -251,13 +254,13 @@ kubectl exec -n union-cp deploy/flyteadmin -- \ ```shell # Verify service endpoints -kubectl get svc -n union-cp | grep -E 'flyteadmin|nginx-controller' +kubectl get svc -n | grep -E 'admin\|nginx-controller' # Test DNS resolution from data plane namespace -kubectl run -n union test-dns --image=busybox --rm -it -- \ - nslookup controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl run -n test-dns --image=busybox --rm -it -- \ + nslookup ..svc.cluster.local # Check network policies -kubectl get networkpolicies -n union-cp -kubectl get networkpolicies -n union +kubectl get networkpolicies -n +kubectl get networkpolicies -n ``` diff --git a/content/deployment/selfhosted-deployment/data-plane-aws.md b/content/deployment/selfhosted-deployment/data-plane-aws.md index 295abb963..7d69e87fb 100644 --- a/content/deployment/selfhosted-deployment/data-plane-aws.md +++ b/content/deployment/selfhosted-deployment/data-plane-aws.md @@ -15,7 +15,7 @@ This guide covers deploying the {{< key product_name >}} data plane in the same In addition to the [general prerequisites](./_index#prerequisites): -1. **{{< key product_name >}} control plane** deployed in the same cluster (namespace `union-cp`) +1. **{{< key product_name >}} control plane** deployed in the same cluster 2. 
**S3 buckets** for data plane metadata storage 3. **IAM roles** configured with [IRSA](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html) for backend and worker service accounts 4. **Network connectivity** between data plane and control plane namespaces @@ -26,7 +26,7 @@ In addition to the [general prerequisites](./_index#prerequisites): ```shell helm upgrade --install unionai-dataplane-crds unionai/dataplane-crds \ - --namespace union \ + --namespace \ --create-namespace ``` @@ -47,9 +47,9 @@ global: AWS_REGION: "us-east-1" BACKEND_IAM_ROLE_ARN: "arn:aws:iam::123456789012:role/union-backend" WORKER_IAM_ROLE_ARN: "arn:aws:iam::123456789012:role/union-worker" - CONTROLPLANE_INTRA_CLUSTER_HOST: "controlplane-nginx-controller.union-cp.svc.cluster.local" - QUEUE_SERVICE_HOST: "queue.union-cp.svc.cluster.local:80" - CACHESERVICE_ENDPOINT: "cacheservice.union-cp.svc.cluster.local:89" + CONTROLPLANE_INTRA_CLUSTER_HOST: "..svc.cluster.local" + QUEUE_SERVICE_HOST: "..svc.cluster.local:80" + CACHESERVICE_ENDPOINT: "..svc.cluster.local:89" ``` If authentication is enabled on the control plane, also set `AUTH_CLIENT_ID`. See the [Authentication](./authentication) guide. @@ -58,7 +58,7 @@ If authentication is enabled on the control plane, also set `AUTH_CLIENT_ID`. 
Se ```shell helm upgrade --install unionai-dataplane unionai/dataplane \ - --namespace union \ + --namespace \ --create-namespace \ -f values.aws.selfhosted-intracluster.yaml \ -f values.aws.selfhosted-overrides.yaml \ @@ -75,14 +75,14 @@ helm upgrade --install unionai-dataplane unionai/dataplane \ ```shell # Check that data plane pods are running -kubectl get pods -n union +kubectl get pods -n # Verify connectivity to control plane -kubectl logs -n union -l app.kubernetes.io/name=operator --tail=50 | grep "connection" +kubectl logs -n -l app.kubernetes.io/name=operator --tail=50 | grep "connection" # Check service DNS resolution -kubectl exec -n union deploy/unionai-dataplane-operator -- \ - nslookup controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl exec -n deploy/unionai-dataplane-operator -- \ + nslookup ..svc.cluster.local ``` ## Key differences from self-managed deployment @@ -101,23 +101,23 @@ kubectl exec -n union deploy/unionai-dataplane-operator -- \ ```shell # Check DNS resolution from data plane namespace -kubectl run -n union test-dns --image=busybox --rm -it -- \ - nslookup controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl run -n test-dns --image=busybox --rm -it -- \ + nslookup ..svc.cluster.local # Verify the service exists -kubectl get svc -n union-cp | grep nginx-controller +kubectl get svc -n | grep nginx-controller ``` ### Connection refused errors ```shell # Verify control plane services are running -kubectl get svc -n union-cp -kubectl get pods -n union-cp +kubectl get svc -n +kubectl get pods -n # Check network policies -kubectl get networkpolicies -n union -kubectl get networkpolicies -n union-cp +kubectl get networkpolicies -n +kubectl get networkpolicies -n ``` ### Certificate verification errors diff --git a/content/deployment/selfhosted-deployment/data-plane-gcp.md b/content/deployment/selfhosted-deployment/data-plane-gcp.md index bcfd13ad5..bc87f4793 100644 --- 
a/content/deployment/selfhosted-deployment/data-plane-gcp.md +++ b/content/deployment/selfhosted-deployment/data-plane-gcp.md @@ -20,7 +20,7 @@ This guide covers deploying the {{< key product_name >}} data plane in the same In addition to the [general prerequisites](./_index#prerequisites): -1. **{{< key product_name >}} control plane** deployed in the same cluster (namespace `union-cp`) +1. **{{< key product_name >}} control plane** deployed in the same cluster 2. **GCS buckets** for data plane metadata storage 3. **GCP service accounts** configured with [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) for backend and worker service accounts 4. **Network connectivity** between data plane and control plane namespaces @@ -31,7 +31,7 @@ In addition to the [general prerequisites](./_index#prerequisites): ```shell helm upgrade --install unionai-dataplane-crds unionai/dataplane-crds \ - --namespace union \ + --namespace \ --create-namespace ``` @@ -53,9 +53,9 @@ global: GOOGLE_PROJECT_ID: "my-gcp-project" BACKEND_IAM_ROLE_ARN: "union-backend@my-project.iam.gserviceaccount.com" WORKER_IAM_ROLE_ARN: "union-worker@my-project.iam.gserviceaccount.com" - CONTROLPLANE_INTRA_CLUSTER_HOST: "controlplane-nginx-controller.union-cp.svc.cluster.local" - QUEUE_SERVICE_HOST: "queue.union-cp.svc.cluster.local:80" - CACHESERVICE_ENDPOINT: "cacheservice.union-cp.svc.cluster.local:89" + CONTROLPLANE_INTRA_CLUSTER_HOST: "..svc.cluster.local" + QUEUE_SERVICE_HOST: "..svc.cluster.local:80" + CACHESERVICE_ENDPOINT: "..svc.cluster.local:89" ``` If authentication is enabled on the control plane, also set `AUTH_CLIENT_ID`. See the [Authentication](./authentication) guide. @@ -64,7 +64,7 @@ If authentication is enabled on the control plane, also set `AUTH_CLIENT_ID`. 
Se ```shell helm upgrade --install unionai-dataplane unionai/dataplane \ - --namespace union \ + --namespace \ --create-namespace \ -f values.gcp.selfhosted-intracluster.yaml \ -f values.gcp.selfhosted-overrides.yaml \ @@ -81,14 +81,14 @@ helm upgrade --install unionai-dataplane unionai/dataplane \ ```shell # Check that data plane pods are running -kubectl get pods -n union +kubectl get pods -n # Verify connectivity to control plane -kubectl logs -n union -l app.kubernetes.io/name=operator --tail=50 | grep "connection" +kubectl logs -n -l app.kubernetes.io/name=operator --tail=50 | grep "connection" # Check service DNS resolution -kubectl exec -n union deploy/unionai-dataplane-operator -- \ - nslookup controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl exec -n deploy/unionai-dataplane-operator -- \ + nslookup ..svc.cluster.local ``` ## Key differences from self-managed deployment @@ -107,37 +107,37 @@ kubectl exec -n union deploy/unionai-dataplane-operator -- \ ```shell # Check DNS resolution from data plane namespace -kubectl run -n union test-dns --image=busybox --rm -it -- \ - nslookup controlplane-nginx-controller.union-cp.svc.cluster.local +kubectl run -n test-dns --image=busybox --rm -it -- \ + nslookup ..svc.cluster.local # Verify the service exists -kubectl get svc -n union-cp | grep nginx-controller +kubectl get svc -n | grep nginx-controller ``` ### Connection refused errors ```shell # Verify control plane services are running -kubectl get svc -n union-cp -kubectl get pods -n union-cp +kubectl get svc -n +kubectl get pods -n # Check network policies -kubectl get networkpolicies -n union -kubectl get networkpolicies -n union-cp +kubectl get networkpolicies -n +kubectl get networkpolicies -n ``` ### Workload Identity issues ```shell # Verify service account annotations -kubectl get sa -n union -o yaml | grep iam.gke.io/gcp-service-account +kubectl get sa -n -o yaml | grep iam.gke.io/gcp-service-account # Check IAM bindings gcloud iam 
service-accounts get-iam-policy gcloud iam service-accounts get-iam-policy # Verify pod can authenticate -kubectl exec -n union deploy/unionai-dataplane-operator -- \ +kubectl exec -n deploy/unionai-dataplane-operator -- \ curl -H "Metadata-Flavor: Google" \ http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/email ``` diff --git a/content/deployment/selfhosted-deployment/image-builder.md b/content/deployment/selfhosted-deployment/image-builder.md index cff0ce0dd..ce72965d0 100644 --- a/content/deployment/selfhosted-deployment/image-builder.md +++ b/content/deployment/selfhosted-deployment/image-builder.md @@ -1,6 +1,6 @@ --- title: Image builder -weight: 5 +weight: 8 variants: -flyte -serverless -byoc +selfmanaged --- diff --git a/content/deployment/selfhosted-deployment/monitoring.md b/content/deployment/selfhosted-deployment/monitoring.md index c1809389a..fb2c1d443 100644 --- a/content/deployment/selfhosted-deployment/monitoring.md +++ b/content/deployment/selfhosted-deployment/monitoring.md @@ -1,7 +1,8 @@ --- title: Monitoring -weight: 8 +weight: 7 variants: -flyte -serverless -byoc +selfmanaged +mermaid: true --- # Monitoring @@ -13,49 +14,43 @@ variants: -flyte -serverless -byoc +selfmanaged In a self-hosted deployment, the controlplane and dataplane share a single Kubernetes cluster. The controlplane namespace runs Prometheus, Grafana, and AlertManager. Prometheus scrapes metrics from services in both namespaces. -``` -┌──────────────────────────────────────────────────────────────┐ -│ Kubernetes Cluster │ -│ │ -│ controlplane namespace dataplane namespace │ -│ ┌─────────────────────┐ ┌────────────────────┐ │ -│ │ Prometheus │── scrapes ──▶│ Union Operator │ │ -│ │ Grafana │ │ Executor (V2) │ │ -│ │ AlertManager │ │ Propeller (V1) │ │ -│ │ │ │ │ │ -│ │ FlyteAdmin │ │ ServiceMonitor │ │ -│ │ Executions │ │ PrometheusRule │ │ -│ │ Queue │ │ Dashboard CM │ │ -│ │ Cluster │ └────────────────────┘ │ -│ │ Authorizer │ │ -│ │ ... 
│ ┌────────────────────┐ │ -│ └─────────────────────┘ │ Static Prometheus │ │ -│ │ (Union features) │ │ -│ └────────────────────┘ │ -└──────────────────────────────────────────────────────────────┘ +```mermaid +graph LR + subgraph cluster["Kubernetes Cluster"] + subgraph cp["Controlplane Namespace"] + prom["Prometheus\nGrafana\nAlertManager"] + cpsvc["CP Services\nServiceMonitor\nPrometheusRule\nDashboard CM"] + end + + subgraph dp["Dataplane Namespace"] + dpsvc["Operator\nExecutor\nPropeller"] + dpmon["ServiceMonitor\nPrometheusRule\nDashboard CM"] + static["Static Prometheus\n(Union features)"] + end + + prom -- scrapes --> dpsvc + prom -- scrapes --> cpsvc + end ``` ### Separate controlplane and dataplane clusters When the controlplane and dataplane run in separate clusters, each cluster can run its own monitoring stack independently. The dataplane chart includes the same Prometheus, Grafana, and alerting capabilities. -``` -┌──────────────────────────┐ ┌──────────────────────────┐ -│ Controlplane Cluster │ │ Dataplane Cluster │ -│ │ │ │ -│ ┌─────────────────────┐ │ │ ┌──────────────────────┐ │ -│ │ Prometheus │ │ │ │ Prometheus │ │ -│ │ Grafana │ │ │ │ Grafana │ │ -│ │ AlertManager │ │ │ │ AlertManager │ │ -│ │ │ │ │ │ │ │ -│ │ CP Services │ │ │ │ Union Operator │ │ -│ │ ServiceMonitor │ │ │ │ Executor (V2) │ │ -│ │ PrometheusRule │ │ │ │ Propeller (V1) │ │ -│ │ Dashboard CM │ │ │ │ ServiceMonitor │ │ -│ └─────────────────────┘ │ │ │ PrometheusRule │ │ -│ │ │ │ Dashboard CM │ │ -│ │ │ └──────────────────────┘ │ -└──────────────────────────┘ └──────────────────────────┘ +```mermaid +graph LR + subgraph cpcluster["Controlplane Cluster"] + cpprom["Prometheus\nGrafana\nAlertManager"] + cpstuff["CP Services\nServiceMonitor\nPrometheusRule\nDashboard CM"] + end + + subgraph dpcluster["Dataplane Cluster"] + dpprom["Prometheus\nGrafana\nAlertManager"] + dpstuff["Operator · Executor · Propeller\nServiceMonitor\nPrometheusRule\nDashboard CM"] + end + + cpprom -- scrapes 
--> cpstuff + dpprom -- scrapes --> dpstuff ``` ## Dashboards