From 71abd72bc8040a4d21b91832ce1bfcec6f827d8a Mon Sep 17 00:00:00 2001 From: Elliot Gunton Date: Wed, 26 Nov 2025 10:04:07 +0000 Subject: [PATCH 1/3] feat: OffloadTaskResults to separate etcd store (configurable) Signed-off-by: Elliot Gunton --- cmd/argoexec/executor/init.go | 9 ++++++- config/config.go | 11 ++++++++ .../workflow-controller-configmap.yaml | 4 +++ workflow/controller/controller.go | 25 ++++++++++++++++++- workflow/controller/controller_test.go | 2 +- workflow/controller/operator.go | 2 +- workflow/controller/taskresult.go | 2 +- 7 files changed, 50 insertions(+), 5 deletions(-) diff --git a/cmd/argoexec/executor/init.go b/cmd/argoexec/executor/init.go index 5b7f9c6e42cc..ee601aab6709 100644 --- a/cmd/argoexec/executor/init.go +++ b/cmd/argoexec/executor/init.go @@ -10,6 +10,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" @@ -79,7 +80,13 @@ func Init(ctx context.Context, clientConfig clientcmd.ClientConfig, varRunArgo s wfExecutor := executor.NewExecutor( ctx, clientset, - versioned.NewForConfigOrDie(config).ArgoprojV1alpha1().WorkflowTaskResults(namespace), + versioned.NewForConfigOrDie(&rest.Config{ + Host: "https://argo-wtr-apiserver.argo.svc.cluster.local:443", + BearerToken: "mytoken", + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + }, + }).ArgoprojV1alpha1().WorkflowTaskResults(namespace), restClient, podName, types.UID(os.Getenv(common.EnvVarPodUID)), diff --git a/config/config.go b/config/config.go index c98bc9ae3326..aa8aca77437d 100644 --- a/config/config.go +++ b/config/config.go @@ -123,6 +123,9 @@ type Config struct { // ArtifactDrivers lists artifact driver plugins we can use ArtifactDrivers []ArtifactDriver `json:"artifactDrivers,omitempty"` + + // OffloadTaskResults holds the config for offloading task results to a separate store + OffloadTaskResults *OffloadTaskResultsConfig 
`json:"offloadTaskResults,omitempty"` } // ArtifactDriver is a plugin for an artifact driver @@ -452,3 +455,11 @@ func (req *WorkflowRestrictions) MustNotChangeSpec() bool { } return req.TemplateReferencing == TemplateReferencingSecure } + +type OffloadTaskResultsConfig struct { + // Enabled controls offloading. Default false. + Enabled bool `json:"enabled,omitempty"` + + // APIServer is the Kube API endpoint to write WorkflowTaskResults to. + APIServer string `json:"APIServer,omitempty"` +} diff --git a/manifests/base/workflow-controller/workflow-controller-configmap.yaml b/manifests/base/workflow-controller/workflow-controller-configmap.yaml index d28f4edb3f28..237f80e9a873 100644 --- a/manifests/base/workflow-controller/workflow-controller-configmap.yaml +++ b/manifests/base/workflow-controller/workflow-controller-configmap.yaml @@ -2,3 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: workflow-controller-configmap +data: + offloadTaskResults: | + enabled: true + APIServer: argo-wtr-etcd.argo.svc:2379 diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index 5bf5d5dadc86..e5e5f283f9a5 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -40,6 +40,7 @@ import ( argoErr "github.com/argoproj/argo-workflows/v3/errors" "github.com/argoproj/argo-workflows/v3/persist/sqldb" wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned" wfclientset "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned" "github.com/argoproj/argo-workflows/v3/pkg/client/informers/externalversions" wfextvv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/client/informers/externalversions/workflow/v1alpha1" @@ -112,6 +113,7 @@ type WorkflowController struct { rateLimiter *rate.Limiter dynamicInterface dynamic.Interface wfclientset wfclientset.Interface + wftrclientset wfclientset.Interface // maxStackDepth is a 
configurable limit to the depth of the "stack", which is increased with every nested call to // woc.executeTemplate and decreased when such calls return. This is used to prevent infinite recursion @@ -184,22 +186,43 @@ func init() { // NewWorkflowController instantiates a new WorkflowController func NewWorkflowController(ctx context.Context, restConfig *rest.Config, kubeclientset kubernetes.Interface, wfclientset wfclientset.Interface, namespace, managedNamespace, executorImage, executorImagePullPolicy, executorLogFormat, configMap string, executorPlugins bool) (*WorkflowController, error) { + logger := logging.RequireLoggerFromContext(ctx) dynamicInterface, err := dynamic.NewForConfig(restConfig) if err != nil { return nil, err } + configController := config.NewController(namespace, configMap, kubeclientset) + config, err := configController.Get(ctx) + if err != nil { + return nil, err + } + + wftrclientset := wfclientset + if config.OffloadTaskResults != nil && config.OffloadTaskResults.Enabled { + logger.Info(ctx, "Setting offloadtaskresults client") + offloadCfg := rest.Config{ + Host: config.OffloadTaskResults.APIServer, + BearerToken: "mytoken", + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, // TODO: Not use insecure + }, + } + wftrclientset = versioned.NewForConfigOrDie(&offloadCfg) + } + wfc := WorkflowController{ restConfig: restConfig, kubeclientset: kubeclientset, dynamicInterface: dynamicInterface, wfclientset: wfclientset, + wftrclientset: wftrclientset, namespace: namespace, managedNamespace: managedNamespace, cliExecutorImage: executorImage, cliExecutorImagePullPolicy: executorImagePullPolicy, cliExecutorLogFormat: executorLogFormat, - configController: config.NewController(namespace, configMap, kubeclientset), + configController: configController, workflowKeyLock: syncpkg.NewKeyLock(), cacheFactory: controllercache.NewCacheFactory(kubeclientset, namespace), eventRecorderManager: events.NewEventRecorderManager(kubeclientset), diff --git 
a/workflow/controller/controller_test.go b/workflow/controller/controller_test.go index b061a293ecb0..265bbdc42edd 100644 --- a/workflow/controller/controller_test.go +++ b/workflow/controller/controller_test.go @@ -473,7 +473,7 @@ func withOutputs(ctx context.Context, outputs wfv1.Outputs) with { Outputs: &outputs, }, } - _, err := woc.controller.wfclientset.ArgoprojV1alpha1().WorkflowTaskResults(woc.wf.Namespace). + _, err := woc.controller.wftrclientset.ArgoprojV1alpha1().WorkflowTaskResults(woc.wf.Namespace). Create( ctx, taskResult, diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 07b0632eddf0..b0390c0b4385 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -834,7 +834,7 @@ func (woc *wfOperationCtx) checkTaskResultsInProgress(ctx context.Context) bool func (woc *wfOperationCtx) deleteTaskResults(ctx context.Context) error { deletePropagationBackground := metav1.DeletePropagationBackground - return woc.controller.wfclientset.ArgoprojV1alpha1().WorkflowTaskResults(woc.wf.Namespace). + return woc.controller.wftrclientset.ArgoprojV1alpha1().WorkflowTaskResults(woc.wf.Namespace). DeleteCollection( ctx, metav1.DeleteOptions{PropagationPolicy: &deletePropagationBackground}, diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go index 971c7b7d7692..4e9ef858c985 100644 --- a/workflow/controller/taskresult.go +++ b/workflow/controller/taskresult.go @@ -35,7 +35,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer(ctx context.Context // This is a generated function, so we can't change the context. 
//nolint:contextcheck informer := wfextvv1alpha1.NewFilteredWorkflowTaskResultInformer( - wfc.wfclientset, + wfc.wftrclientset, wfc.GetManagedNamespace(), 20*time.Minute, cache.Indexers{ From 3fd45ca68dcd68ab631811875ceb463ed352bd8a Mon Sep 17 00:00:00 2001 From: Elliot Gunton Date: Thu, 27 Nov 2025 16:43:37 +0000 Subject: [PATCH 2/3] docs: Add offloading workflow task results to "scaling" page * Also add minimal YAMLs for creating kube-apiserver (not to be merged) Signed-off-by: Elliot Gunton --- docs/running-at-massive-scale.md | 2 + docs/scaling.md | 192 ++++++++++++++++++ .../poc-workflow-task-result-offloader.yaml | 138 +++++++++++++ .../workflow-controller-configmap.yaml | 2 +- 4 files changed, 333 insertions(+), 1 deletion(-) create mode 100644 manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml diff --git a/docs/running-at-massive-scale.md b/docs/running-at-massive-scale.md index e57221a71050..89dcd5240e28 100644 --- a/docs/running-at-massive-scale.md +++ b/docs/running-at-massive-scale.md @@ -27,6 +27,8 @@ Where Argo has a lot of work to do, the Kubernetes API can be overwhelmed. There * Limit the number of concurrent workflows using parallelism. * Rate-limit pod creation [configuration](workflow-controller-configmap.yaml) (>= v3.1). * Set [`DEFAULT_REQUEUE_TIME=1m`](environment-variables.md) +* Offload Workflow Task Results by using an external Kubernetes API Server via the `OffloadTaskResultsConfig` in the Workflow Controller ConfigMap. (>=4.0 [TBD]). + Read more in [Vertically Scaling](./scaling.md#offloading-workflow-task-results-to-a-secondary-kubernetes-api-server). ## Overwhelmed Database diff --git a/docs/scaling.md b/docs/scaling.md index 7fd8c277fdd9..eca6af96d8c3 100644 --- a/docs/scaling.md +++ b/docs/scaling.md @@ -100,6 +100,198 @@ It is not possible to provide a one-size-fits-all recommendation for these value !!! 
Note Despite the name, this rate limit only applies to the creation of Pods and not the creation of other Kubernetes resources (for example, ConfigMaps or PersistentVolumeClaims). +### Offloading Workflow Task Results to a Secondary Kubernetes API Server + +Workflow Task Results are how Argo Workflows tracks outputs of pods and passes them between tasks (in a DAG) and steps. +They are provided as a Custom Resource Definition (CRD) within the Argo Workflows installation, as `WorkflowTaskResults`, with the Argo executor creating and updating them, and the Workflow Controller reading them. +It is possible that with many workflows the Kubernetes API will be overwhelmed due to the creation and deletion of many `WorkflowTaskResults` on the cluster. +To solve this, the Workflow Controller ConfigMap can specify an `OffloadTaskResultsConfig`. + +#### POC Setup (Not for upstream docs) + +The goal is to have a fully functional Kubernetes API endpoint that stores Argo's `WorkflowTaskResults` in its own data store. +Conceptually, we will be running a lightweight sub-cluster within the main cluster, in a similar way to tools like `vcluster`. +For this, we will run a Kubernetes API Server (Service and Deployment) and point it to an `etcd` Service/Deployment for its backend storage. +This is loosely how the Kubernetes Control Plane itself runs -- for more information, take a look at the [Kubernetes Components](https://kubernetes.io/docs/concepts/overview/components/) documentation. + +##### Running a Kubernetes API Server + +###### Run an `etcd` instance + +We deploy a single-node `etcd`. The API server uses it exactly like the real Kubernetes control plane would. + +| Flag | Why we need it | +| ------------------------------------------------------ | ------------------------------------------------------- | +| `--data-dir=/var/lib/etcd` | Local storage. We use `emptyDir:` for ephemeral POC. 
| +| `--advertise-client-urls` / `--listen-client-urls` | Expose the client API on port 2379. | +| `--listen-peer-urls` / `--initial-advertise-peer-urls` | Required even for a single-member “cluster”. | +| `--initial-cluster` | Defines the cluster membership. Required syntactically. | + +The Service simply exposes port 2379 inside the namespace so the API server can reach it at `http://argo-wtr-etcd.argo.svc:2379`. + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: argo-wtr-etcd + namespace: argo +spec: + replicas: 1 + selector: + matchLabels: + app: argo-wtr-etcd + template: + metadata: + labels: + app: argo-wtr-etcd + spec: + containers: + - name: etcd + image: quay.io/coreos/etcd:v3.6.6 + command: + - etcd + - --name=argo-wtr-etcd + - --data-dir=/var/lib/etcd + - --advertise-client-urls=http://0.0.0.0:2379 + - --listen-client-urls=http://0.0.0.0:2379 + - --listen-peer-urls=http://0.0.0.0:2380 + - --initial-advertise-peer-urls=http://0.0.0.0:2380 + - --initial-cluster=argo-wtr-etcd=http://0.0.0.0:2380 + ports: + - containerPort: 2379 + - containerPort: 2380 + volumeMounts: + - name: data + mountPath: /var/lib/etcd + volumes: + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: argo-wtr-etcd + namespace: argo +spec: + selector: + app: argo-wtr-etcd + ports: + - port: 2379 + targetPort: 2379 + name: client +``` + +###### Set Up Certs & API Server Security + +A full kube-apiserver normally requires multiple certificates, CA bundles, front-proxy certs, and authentication plugins. + +For this POC we run with the absolute minimum we can get away with: + +| File | Purpose | +| -------------------- | ------------------------------------------------------------------------------------------ | +| `tls.crt`, `tls.key` | Server certificate & private key for HTTPS endpoint (`--secure-port=443`). | +| `serviceaccount.key` | Used both as the *public* and *private* key for signing service account tokens. 
| `tokens.csv` | Static token authentication. Used so kubectl can authenticate without bootstrap machinery. |
+
+We create `tls.crt` and `tls.key` using:
+
+```console
+openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout tls.key -out tls.crt -subj "/CN=argo-wtr-apiserver"
+```
+
+We create `serviceaccount.key` using:
+
+```console
+openssl genrsa -out serviceaccount.key 2048
+```
+
+`tokens.csv` contains a static token authentication file, where the format of each line is `<token>,<user>,<uid>,"<group1>,<group2>,..."`.
+
+Copy these values to the ConfigMap:
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: certs-and-keys
+  namespace: argo
+data:
+  serviceaccount.key: |
+    -----BEGIN PRIVATE KEY-----
+    <contents of serviceaccount.key>
+    -----END PRIVATE KEY-----
+  tls.crt: |
+    -----BEGIN CERTIFICATE-----
+    <contents of tls.crt>
+    -----END CERTIFICATE-----
+
+  tls.key: |
+    -----BEGIN PRIVATE KEY-----
+    <contents of tls.key>
+    -----END PRIVATE KEY-----
+  tokens.csv: |
+    mytoken,admin,1,"system:masters"
+```
+
+###### Run the kube-apiserver
+
+| Flag | Why |
+| --------------------------------------------------- | --------------------------------------------------- |
+| `--etcd-servers=http://argo-wtr-etcd.argo.svc:2379` | Backend database. |
+| `--secure-port=443` | Only expose HTTPS; insecure port removed in >=1.31. |
+| `--tls-cert-file`, `--tls-private-key-file` | Required since insecure-port is gone. |
+| `--token-auth-file=/var/run/kubernetes/tokens.csv` | Simplest auth flow for kubectl. |
+| `--service-account-key-file` | Needed even if we don’t actually use SA tokens. |
+| `--service-account-signing-key-file` | Required in 1.20+ to serve the SA issuer. |
+| `--service-account-issuer` | Must match what your workloads use when validating. |
+| `--authorization-mode=AlwaysAllow` | Disables RBAC entirely. |
+| `--enable-admission-plugins=NamespaceLifecycle` | Default admission plugin required for namespace-scoped CRDs and is on by default in upstream. 
| + +###### Apply the `WorkflowTaskResults` CRD + +Once the API server is running, we can apply the CRD directly to it: + +```console +kubectl \ + --server=https://127.0.0.1:443 \ + --token=mytoken \ + --insecure-skip-tls-verify=true \ + apply -f argoproj.io_workflowtaskresults.yaml +``` + +###### Optional convenience: use a Config for `kubectl` and `k9s` + +To save writing out the args to `kubectl` and `k9s`, you can use this Config: + +```yaml +apiVersion: v1 +kind: Config +clusters: +- cluster: + server: https://127.0.0.1:443 + insecure-skip-tls-verify: true + name: argo-wtr-cluster +users: +- name: argo-wtr-user + user: + token: mytoken +contexts: +- context: + cluster: argo-wtr-cluster + user: argo-wtr-user + name: argo-wtr-context +current-context: argo-wtr-context +``` + +And run commands like: + +```console +KUBECONFIG=api-server-kubeconfig.yaml kubectl get ns +KUBECONFIG=api-server-kubeconfig.yaml ./k9s +``` + +(Download `k9s` to the container if using Dev Containers.) + ## Sharding ### One Install Per Namespace diff --git a/manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml b/manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml new file mode 100644 index 000000000000..d870807e9319 --- /dev/null +++ b/manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml @@ -0,0 +1,138 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: argo-wtr-etcd + namespace: argo +spec: + replicas: 1 + selector: + matchLabels: + app: argo-wtr-etcd + template: + metadata: + labels: + app: argo-wtr-etcd + spec: + containers: + - name: etcd + image: quay.io/coreos/etcd:v3.6.6 + command: + - etcd + - --name=argo-wtr-etcd + - --data-dir=/var/lib/etcd + - --advertise-client-urls=http://0.0.0.0:2379 + - --listen-client-urls=http://0.0.0.0:2379 + - --listen-peer-urls=http://0.0.0.0:2380 + - --initial-advertise-peer-urls=http://0.0.0.0:2380 + - --initial-cluster=argo-wtr-etcd=http://0.0.0.0:2380 + ports: + - 
containerPort: 2379 + - containerPort: 2380 + volumeMounts: + - name: data + mountPath: /var/lib/etcd + volumes: + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: argo-wtr-etcd + namespace: argo +spec: + selector: + app: argo-wtr-etcd + ports: + - port: 2379 + targetPort: 2379 + name: client +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: certs-and-keys + namespace: argo +data: + serviceaccount.key: | + -----BEGIN PRIVATE KEY----- + + -----END PRIVATE KEY----- + tls.crt: | + -----BEGIN CERTIFICATE----- + + -----END CERTIFICATE----- + + tls.key: | + -----BEGIN PRIVATE KEY----- + + -----END PRIVATE KEY----- + tokens.csv: | + mytoken,admin,1,system:masters +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: argo-wtr-apiserver + namespace: argo +spec: + replicas: 1 + selector: + matchLabels: + app: argo-wtr-apiserver + template: + metadata: + labels: + app: argo-wtr-apiserver + spec: + containers: + - name: kube-apiserver + image: registry.k8s.io/kube-apiserver:v1.31.9 + command: + - kube-apiserver + - --etcd-servers=http://argo-wtr-etcd.argo.svc:2379 + - --advertise-address=127.0.0.1 + - --allow-privileged=true + - --token-auth-file=/var/run/kubernetes/tokens.csv + - --service-account-key-file=/var/run/kubernetes/serviceaccount.key + - --service-account-signing-key-file=/var/run/kubernetes/serviceaccount.key + - --service-account-issuer=https://argo-wtr-apiserver.argo.svc + - --api-audiences=https://argo-wtr-apiserver.argo.svc + - --secure-port=443 + - --service-cluster-ip-range=10.96.0.0/12 + - --authorization-mode=AlwaysAllow + - --enable-admission-plugins=NamespaceLifecycle + - --tls-cert-file=/var/run/kubernetes/tls.crt + - --tls-private-key-file=/var/run/kubernetes/tls.key + ports: + - containerPort: 443 + volumeMounts: + - name: certs-and-keys-vol + mountPath: /var/run/kubernetes + readOnly: true + volumes: + - name: certs-and-keys-vol + configMap: + name: certs-and-keys + items: + - key: serviceaccount.key 
+ path: serviceaccount.key + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key + - key: tokens.csv + path: tokens.csv +--- +apiVersion: v1 +kind: Service +metadata: + name: argo-wtr-apiserver + namespace: argo +spec: + selector: + app: argo-wtr-apiserver + ports: + - port: 443 + targetPort: 443 + name: https diff --git a/manifests/base/workflow-controller/workflow-controller-configmap.yaml b/manifests/base/workflow-controller/workflow-controller-configmap.yaml index 237f80e9a873..ba13c645e115 100644 --- a/manifests/base/workflow-controller/workflow-controller-configmap.yaml +++ b/manifests/base/workflow-controller/workflow-controller-configmap.yaml @@ -5,4 +5,4 @@ metadata: data: offloadTaskResults: | enabled: true - APIServer: argo-wtr-etcd.argo.svc:2379 + APIServer: https://localhost:443 From dd7ca112eb2414d9b6251d95a57d727300058259 Mon Sep 17 00:00:00 2001 From: Elliot Gunton Date: Thu, 4 Dec 2025 16:13:51 +0000 Subject: [PATCH 3/3] fix: Refine instructions * Update `advertise-address` to stop "loopback" error messages in logs * Doesn't seem that manual port forward is needed before `make start` * Also create `argo` namespace (must match running workflow's namespace) Signed-off-by: Elliot Gunton --- cmd/argoexec/executor/init.go | 2 +- docs/scaling.md | 52 ++++++++++++++++--- .../poc-workflow-task-result-offloader.yaml | 15 ++++-- .../workflow-controller-configmap.yaml | 2 +- workflow/controller/controller.go | 6 +++ 5 files changed, 63 insertions(+), 14 deletions(-) diff --git a/cmd/argoexec/executor/init.go b/cmd/argoexec/executor/init.go index ee601aab6709..c9dd02fbde00 100644 --- a/cmd/argoexec/executor/init.go +++ b/cmd/argoexec/executor/init.go @@ -81,7 +81,7 @@ func Init(ctx context.Context, clientConfig clientcmd.ClientConfig, varRunArgo s ctx, clientset, versioned.NewForConfigOrDie(&rest.Config{ - Host: "https://argo-wtr-apiserver.argo.svc.cluster.local:443", + Host: "https://argo-wtr-apiserver.argo.svc.cluster.local:6443", BearerToken: 
"mytoken", TLSClientConfig: rest.TLSClientConfig{ Insecure: true, diff --git a/docs/scaling.md b/docs/scaling.md index eca6af96d8c3..890fd04e5500 100644 --- a/docs/scaling.md +++ b/docs/scaling.md @@ -189,7 +189,7 @@ For this POC we run with the absolute minimum we can get away with: | File | Purpose | | -------------------- | ------------------------------------------------------------------------------------------ | -| `tls.crt`, `tls.key` | Server certificate & private key for HTTPS endpoint (`--secure-port=443`). | +| `tls.crt`, `tls.key` | Server certificate & private key for HTTPS endpoint (`--secure-port=6443`). | | `serviceaccount.key` | Used both as the *public* and *private* key for signing service account tokens. | | `tokens.csv` | Static token authentication. Used so kubectl can authenticate without bootstrap machinery. | @@ -238,27 +238,47 @@ data: | Flag | Why | | --------------------------------------------------- | --------------------------------------------------- | | `--etcd-servers=http://argo-wtr-etcd.argo.svc:2379` | Backend database. | -| `--secure-port=443` | Only expose HTTPS; insecure port removed in >=1.31. | +| `--secure-port=6443` | Only expose HTTPS; insecure port removed in >=1.31. | | `--tls-cert-file`, `--tls-private-key-file` | Required since insecure-port is gone. | | `--token-auth-file=/var/run/kubernetes/tokens.csv` | Simplest auth flow for kubectl. | | `--service-account-key-file` | Needed even if we don’t actually use SA tokens. | | `--service-account-signing-key-file` | Required in 1.20+ to serve the SA issuer. | | `--service-account-issuer` | Must match what your workloads use when validating. | -| `--authorization-mode=AlwaysAllow` | Disables RBAC entirely. | +| `--authorization-mode=AlwaysAllow` | Disables RBAC entirely. | | `--enable-admission-plugins=NamespaceLifecycle` | Default admission plugin required for namespace-scoped CRDs and is on by default in upstream. 
| ###### Apply the `WorkflowTaskResults` CRD -Once the API server is running, we can apply the CRD directly to it: + +Once the API server is running, we can port forward it and apply the CRD directly to it. + +Port forward in a separate terminal: + +```console +kubectl -n argo port-forward service/argo-wtr-apiserver 6443:6443; +``` + +And then run the `apply`: + +```console +kubectl \ + --server=https://localhost:6443 \ + --token=mytoken \ + --insecure-skip-tls-verify=true \ + apply -f manifests/base/crds/minimal/argoproj.io_workflowtaskresults.yaml +``` + +Also create the `argo` namespace in the API server: ```console kubectl \ - --server=https://127.0.0.1:443 \ + --server=https://localhost:6443 \ --token=mytoken \ --insecure-skip-tls-verify=true \ - apply -f argoproj.io_workflowtaskresults.yaml + create ns argo ``` + ###### Optional convenience: use a Config for `kubectl` and `k9s` To save writing out the args to `kubectl` and `k9s`, you can use this Config: @@ -268,7 +288,7 @@ apiVersion: v1 kind: Config clusters: - cluster: - server: https://127.0.0.1:443 + server: https://localhost:6443 insecure-skip-tls-verify: true name: argo-wtr-cluster users: @@ -292,6 +312,24 @@ KUBECONFIG=api-server-kubeconfig.yaml ./k9s (Download `k9s` to the container if using Dev Containers.) +##### Set Up the Controller Config + +The final step is to tell our Workflows Controller about the offloadTaskResults config. +Based on the above config with the server at `https://localhost:6443`, we can use this `ConfigMap` as `manifests/base/workflow-controller/workflow-controller-configmap.yaml`: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: workflow-controller-configmap +data: + offloadTaskResults: | + enabled: true + APIServer: https://localhost:6443 +``` + +And finally (with the api-server still port-forwarded) run `make start` to run the workflow controller with workflowtaskresult offloading! 
+ ## Sharding ### One Install Per Namespace diff --git a/manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml b/manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml index d870807e9319..7f9b64019984 100644 --- a/manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml +++ b/manifests/base/workflow-controller/poc-workflow-task-result-offloader.yaml @@ -88,24 +88,29 @@ spec: containers: - name: kube-apiserver image: registry.k8s.io/kube-apiserver:v1.31.9 + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP command: - kube-apiserver - --etcd-servers=http://argo-wtr-etcd.argo.svc:2379 - - --advertise-address=127.0.0.1 + - --advertise-address=$(POD_IP) - --allow-privileged=true - --token-auth-file=/var/run/kubernetes/tokens.csv - --service-account-key-file=/var/run/kubernetes/serviceaccount.key - --service-account-signing-key-file=/var/run/kubernetes/serviceaccount.key - --service-account-issuer=https://argo-wtr-apiserver.argo.svc - --api-audiences=https://argo-wtr-apiserver.argo.svc - - --secure-port=443 + - --secure-port=6443 - --service-cluster-ip-range=10.96.0.0/12 - --authorization-mode=AlwaysAllow - --enable-admission-plugins=NamespaceLifecycle - --tls-cert-file=/var/run/kubernetes/tls.crt - --tls-private-key-file=/var/run/kubernetes/tls.key ports: - - containerPort: 443 + - containerPort: 6443 volumeMounts: - name: certs-and-keys-vol mountPath: /var/run/kubernetes @@ -133,6 +138,6 @@ spec: selector: app: argo-wtr-apiserver ports: - - port: 443 - targetPort: 443 + - port: 6443 + targetPort: 6443 name: https diff --git a/manifests/base/workflow-controller/workflow-controller-configmap.yaml b/manifests/base/workflow-controller/workflow-controller-configmap.yaml index ba13c645e115..93514e29a102 100644 --- a/manifests/base/workflow-controller/workflow-controller-configmap.yaml +++ b/manifests/base/workflow-controller/workflow-controller-configmap.yaml @@ -5,4 +5,4 @@ metadata: data: 
offloadTaskResults: | enabled: true - APIServer: https://localhost:443 + APIServer: https://localhost:6443 diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go index e5e5f283f9a5..689997dff580 100644 --- a/workflow/controller/controller.go +++ b/workflow/controller/controller.go @@ -209,6 +209,12 @@ func NewWorkflowController(ctx context.Context, restConfig *rest.Config, kubecli }, } wftrclientset = versioned.NewForConfigOrDie(&offloadCfg) + _, err := wftrclientset.ArgoprojV1alpha1().WorkflowTaskResults("").List(ctx, metav1.ListOptions{}) + if err != nil { + logger.WithError(err).Error(ctx, "Offload cluster connectivity check failed") + } else { + logger.Info(ctx, "Offload cluster connectivity OK") + } } wfc := WorkflowController{