From c116245c3902eb9bcf841544a1c12c4aeb151fa2 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 12:15:23 +0200 Subject: [PATCH 01/24] SCALRCORE-37660 update docs --- .github/actions/update-app-version/index.js | 89 ++++++++++----------- .pre-commit-config.yaml | 6 ++ charts/agent-docker/README.md | 4 +- charts/agent-docker/README.md.gotmpl | 4 +- charts/agent-job/README.md | 40 ++++++--- charts/agent-job/README.md.gotmpl | 40 ++++++--- charts/agent-k8s/README.md | 8 +- charts/agent-k8s/README.md.gotmpl | 8 +- charts/agent-local/README.md | 2 +- charts/agent-local/README.md.gotmpl | 2 +- 10 files changed, 122 insertions(+), 81 deletions(-) diff --git a/.github/actions/update-app-version/index.js b/.github/actions/update-app-version/index.js index ed096b31..639b6891 100644 --- a/.github/actions/update-app-version/index.js +++ b/.github/actions/update-app-version/index.js @@ -1,78 +1,77 @@ -const yaml = require("js-yaml"); -const core = require("@actions/core"); -const exec = require("@actions/exec"); -const github = require("@actions/github"); -const semver = require("semver"); -const fs = require("fs"); -const path = require("path"); +const yaml = require('js-yaml') +const core = require('@actions/core') +const exec = require('@actions/exec') +const semver = require('semver') +const fs = require('fs') +const path = require('path') -const chartsDir = path.join(process.env.GITHUB_WORKSPACE, "charts"); -const appVersion = core.getInput("app_version", { required: true }); -core.info(`The appVersion ${appVersion}`); +const chartsDir = path.join(process.env.GITHUB_WORKSPACE, 'charts') +const appVersion = core.getInput('app_version', { required: true }) +core.info(`The appVersion ${appVersion}`) function getCharts() { - const files = fs.readdirSync(chartsDir); + const files = fs.readdirSync(chartsDir) const directories = files.filter((file) => { - const filePath = path.join(chartsDir, file); - return fs.statSync(filePath).isDirectory(); - }); - 
core.debug(`Charts: ${directories}`); - return directories; + const filePath = path.join(chartsDir, file) + return fs.statSync(filePath).isDirectory() + }) + core.debug(`Charts: ${directories}`) + return directories } function updateCharts(chart) { - const chartPath = path.join(chartsDir, chart, "Chart.yaml"); - const chartData = yaml.load(fs.readFileSync(chartPath, "utf8")); + const chartPath = path.join(chartsDir, chart, 'Chart.yaml') + const chartData = yaml.load(fs.readFileSync(chartPath, 'utf8')) - chartData.appVersion = appVersion; - chartData.version = semver.inc(chartData.version, "patch"); - const updatedYaml = yaml.dump(chartData, { lineWidth: -1 }); - fs.writeFileSync(chartPath, updatedYaml, "utf8"); - core.info(`The new version of ${chart} is ${chartData.version}`); - return chartData.version; + chartData.appVersion = appVersion + chartData.version = semver.inc(chartData.version, 'patch') + const updatedYaml = yaml.dump(chartData, { lineWidth: -1 }) + fs.writeFileSync(chartPath, updatedYaml, 'utf8') + core.info(`The new version of ${chart} is ${chartData.version}`) + return chartData.version } function updateCHANGELOG(chart, chartNewVersion) { - const changelogPath = path.join(chartsDir, chart, "CHANGELOG.md"); + const changelogPath = path.join(chartsDir, chart, 'CHANGELOG.md') const newSection = ` ## [v${chartNewVersion}] ### Updated - Bumping chart version to v${chartNewVersion} for scalr-agent v${appVersion} -`; +` const updatedChangelog = fs - .readFileSync(changelogPath, "utf8") - .replace("## [UNRELEASED]\n", `## [UNRELEASED]\n${newSection}`); - fs.writeFileSync(changelogPath, updatedChangelog, "utf8"); + .readFileSync(changelogPath, 'utf8') + .replace('## [UNRELEASED]\n', `## [UNRELEASED]\n${newSection}`) + fs.writeFileSync(changelogPath, updatedChangelog, 'utf8') } async function pushChanges() { - await exec.exec("git fetch"); - await exec.exec("git checkout master"); - await exec.exec('git config user.name "github-actions[bot]"'); - await 
exec.exec('git config user.email "github-actions[bot]@users.noreply.github.com"');
-  await exec.exec("git add charts");
-  await exec.exec(`git commit -m "Sync appVersion: ${appVersion}`);
-  await exec.exec("git push -u origin master");
+  await exec.exec('git fetch')
+  await exec.exec('git checkout master')
+  await exec.exec('git config user.name "github-actions[bot]"')
+  await exec.exec('git config user.email "github-actions[bot]@users.noreply.github.com"')
+  await exec.exec('git add charts')
+  await exec.exec(`git commit -m "Sync appVersion: ${appVersion}"`)
+  await exec.exec('git push -u origin master')
 }
 
 async function helmDocs() {
-  await exec.exec("helm-docs");
+  await exec.exec('helm-docs')
 }
 
 async function run() {
   try {
-    const charts = getCharts();
+    const charts = getCharts()
     charts.forEach(function (chart) {
-      const chartNewVersion = updateCharts(chart);
-      updateCHANGELOG(chart, chartNewVersion);
-    });
-    await helmDocs();
-    await pushChanges();
+      const chartNewVersion = updateCharts(chart)
+      updateCHANGELOG(chart, chartNewVersion)
+    })
+    await helmDocs()
+    await pushChanges()
   } catch (err) {
-    return core.setFailed(`Error: ${err}`);
+    return core.setFailed(`Error: ${err}`)
   }
 }
-run();
+run()
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2dc4a1f2..7a87779e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,3 +25,9 @@ repos:
         args: [--allow-multiple-documents]
         exclude: "^(charts/agent-k8s/templates|charts/agent-docker/templates|charts/agent-local/templates|charts/agent-job/templates)"
       - id: check-added-large-files
+  - repo: https://github.com/lycheeverse/lychee.git
+    rev: lychee-v0.23.0
+    hooks:
+      - id: lychee
+        args: ["--no-progress", ".", "--offline", "--include-fragments", "--extensions", "md"]
+        pass_filenames: false
diff --git a/charts/agent-docker/README.md b/charts/agent-docker/README.md
index 821730bc..4b250c97 100644
--- a/charts/agent-docker/README.md
+++ b/charts/agent-docker/README.md
@@ -7,14 +7,14 @@ 
where runs are executed in [dind](https://hub.docker.com/_/docker) sidecar conta Run phases are isolated into docker containers. > [!WARNING] -> This chart is planned for deprecation. Use [`agent-local`](/charts/agent-local) instead. +> This chart is planned for deprecation. Use [`agent-local`](../agent-local) instead. ## Overview This chart uses the Scalr Agent with the `docker` driver and a Docker-in-Docker sidecar container. Originally built to run the Docker-based Agent on Kubernetes due to the lack of native Kubernetes support. It has been retained due to adoption challenges with the native agent-k8s chart, we recommend using the newer -[agent-local](../charts/agent-local) chart for new installations instead of agent-docker. +[agent-local](../agent-local) chart for new installations instead of agent-docker. This Kubernetes deployment does not scale across multiple replicas. As a result, the compute capacity managed by each agent is limited to a single node. You can run multiple separate Deployments within diff --git a/charts/agent-docker/README.md.gotmpl b/charts/agent-docker/README.md.gotmpl index 3414c94f..e684b29d 100644 --- a/charts/agent-docker/README.md.gotmpl +++ b/charts/agent-docker/README.md.gotmpl @@ -4,14 +4,14 @@ {{ template "chart.description" . }} > [!WARNING] -> This chart is planned for deprecation. Use [`agent-local`](/charts/agent-local) instead. +> This chart is planned for deprecation. Use [`agent-local`](../agent-local) instead. ## Overview This chart uses the Scalr Agent with the `docker` driver and a Docker-in-Docker sidecar container. Originally built to run the Docker-based Agent on Kubernetes due to the lack of native Kubernetes support. It has been retained due to adoption challenges with the native agent-k8s chart, we recommend using the newer -[agent-local](../charts/agent-local) chart for new installations instead of agent-docker. +[agent-local](../agent-local) chart for new installations instead of agent-docker. 
This Kubernetes deployment does not scale across multiple replicas. As a result, the compute capacity managed by each agent is limited to a single node. You can run multiple separate Deployments within diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 4eb49bfc..ec9d5328 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -8,9 +8,6 @@ in its own Kubernetes Job. See the [official documentation](https://docs.scalr.io/docs/agent-pools) for more information about Scalr Agents. -> [!WARNING] -> This chart is in Beta, and implementation details are subject to change. - ## Table of Contents - [Prerequisites](#prerequisites) @@ -36,7 +33,8 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - Kubernetes 1.35+ - Helm 3.0+ -- ReadWriteMany volumes for [Cache Volume Persistence](#cache-volume-persistence) (optional) +- Cluster-admin permissions (or a role with `customresourcedefinitions` create/update at cluster scope) to install the bundled [CRD](#custom-resource-definitions) +- Optional: [ReadWriteMany](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) volume for [Cache Volume Persistence](#cache-volume-persistence) ## Installation @@ -468,9 +466,11 @@ Leave `sentryDsn` empty (the default) to disable Sentry integration. ## Custom Resource Definitions -This chart bundles the **AgentTaskTemplate CRD** (`agenttasktemplates.scalr.io`) and installs or upgrades it automatically via Helm. The CRD defines the job template that the controller uses to create task pods, so no separate manual step is required in most environments. +This chart bundles the `agenttasktemplates.scalr.io` CRD and installs or upgrades it automatically via Helm. The CRD defines the job template that the controller uses to create task pods. -**Verify installation:** +Installing the CRD requires cluster-admin permissions or a role with `customresourcedefinitions` create/update at cluster scope. 
The identity running `helm install` must have these permissions. + +Verify installation: ```shell kubectl get crd agenttasktemplates.scalr.io @@ -488,6 +488,27 @@ Set `rbac.create=false` to bring your own ServiceAccount/Rules, or adjust permis ## Troubleshooting and Support +### CRD Installation Fails: Insufficient Permissions + +If `helm install` fails with an error like: + +``` +Error: failed to install CRD crds/agenttasktemplate.yaml: 1 error occurred: + * customresourcedefinitions.apiextensions.k8s.io is forbidden: User "..." cannot create resource + "customresourcedefinitions" in API group "apiextensions.k8s.io" at the cluster scope +``` + +The identity running `helm install` does not have cluster-admin permissions. This chart installs the [CRD](#custom-resource-definitions) on first install, which requires cluster-scoped `customresourcedefinitions` create/update access. + +**Fix:** Run `helm install` with a cluster-admin account (or an IAM role/user bound to `cluster-admin`). On EKS, this typically means using the IAM entity that created the cluster or one explicitly granted access via `aws-auth` / EKS access entries: + +```shell +# Verify the current identity has sufficient permissions +kubectl auth can-i create customresourcedefinitions --all-namespaces +``` + +If the output is `yes`, proceed with the install. If `no`, switch to a cluster-admin context before running `helm install`. + ### Debug Logging If you encounter internal system errors or unexpected behavior, enable debug logs: @@ -512,11 +533,8 @@ kubectl logs -n --all-containers ### Getting Support -For issues not covered above: - -1. Enable [debug logging](#debug-logging) -2. [Collect logs](#collecting-logs) from the incident timeframe -3. 
Open a support ticket at [Scalr Support Center](https://scalr-labs.atlassian.net/servicedesk/customer/portal/31) +For issues not covered above, or if you need additional assistance, open a support ticket at [Scalr Support Center](https://scalr-labs.atlassian.net/servicedesk/customer/portal/31). +For errors, see the detailed steps at https://docs.scalr.io/docs/troubleshooting#creating-a-support-ticket on how to gather the right information to speed up issue resolution. ## Maintainers diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 254c5557..bbd648f0 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -5,9 +5,6 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for more information about Scalr Agents. -> [!WARNING] -> This chart is in Beta, and implementation details are subject to change. - ## Table of Contents - [Prerequisites](#prerequisites) @@ -33,7 +30,8 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - Kubernetes 1.35+ - Helm 3.0+ -- ReadWriteMany volumes for [Cache Volume Persistence](#cache-volume-persistence) (optional) +- Cluster-admin permissions (or a role with `customresourcedefinitions` create/update at cluster scope) to install the bundled [CRD](#custom-resource-definitions) +- Optional: [ReadWriteMany](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) volume for [Cache Volume Persistence](#cache-volume-persistence) ## Installation @@ -465,9 +463,11 @@ Leave `sentryDsn` empty (the default) to disable Sentry integration. ## Custom Resource Definitions -This chart bundles the **AgentTaskTemplate CRD** (`agenttasktemplates.scalr.io`) and installs or upgrades it automatically via Helm. The CRD defines the job template that the controller uses to create task pods, so no separate manual step is required in most environments. 
+This chart bundles the `agenttasktemplates.scalr.io` CRD and installs or upgrades it automatically via Helm. The CRD defines the job template that the controller uses to create task pods. -**Verify installation:** +Installing the CRD requires cluster-admin permissions or a role with `customresourcedefinitions` create/update at cluster scope. The identity running `helm install` must have these permissions. + +Verify installation: ```shell kubectl get crd agenttasktemplates.scalr.io @@ -485,6 +485,27 @@ Set `rbac.create=false` to bring your own ServiceAccount/Rules, or adjust permis ## Troubleshooting and Support +### CRD Installation Fails: Insufficient Permissions + +If `helm install` fails with an error like: + +``` +Error: failed to install CRD crds/agenttasktemplate.yaml: 1 error occurred: + * customresourcedefinitions.apiextensions.k8s.io is forbidden: User "..." cannot create resource + "customresourcedefinitions" in API group "apiextensions.k8s.io" at the cluster scope +``` + +The identity running `helm install` does not have cluster-admin permissions. This chart installs the [CRD](#custom-resource-definitions) on first install, which requires cluster-scoped `customresourcedefinitions` create/update access. + +**Fix:** Run `helm install` with a cluster-admin account (or an IAM role/user bound to `cluster-admin`). On EKS, this typically means using the IAM entity that created the cluster or one explicitly granted access via `aws-auth` / EKS access entries: + +```shell +# Verify the current identity has sufficient permissions +kubectl auth can-i create customresourcedefinitions --all-namespaces +``` + +If the output is `yes`, proceed with the install. If `no`, switch to a cluster-admin context before running `helm install`. + ### Debug Logging If you encounter internal system errors or unexpected behavior, enable debug logs: @@ -509,11 +530,8 @@ kubectl logs -n --all-containers ### Getting Support -For issues not covered above: - -1. 
Enable [debug logging](#debug-logging) -2. [Collect logs](#collecting-logs) from the incident timeframe -3. Open a support ticket at [Scalr Support Center](https://scalr-labs.atlassian.net/servicedesk/customer/portal/31) +For issues not covered above, or if you need additional assistance, open a support ticket at [Scalr Support Center](https://scalr-labs.atlassian.net/servicedesk/customer/portal/31). +For errors, see the detailed steps at https://docs.scalr.io/docs/troubleshooting#creating-a-support-ticket on how to gather the right information to speed up issue resolution. {{ template "chart.maintainersSection" . }} diff --git a/charts/agent-k8s/README.md b/charts/agent-k8s/README.md index ca39c2e9..815de5c4 100644 --- a/charts/agent-k8s/README.md +++ b/charts/agent-k8s/README.md @@ -49,8 +49,8 @@ linearly based on the load. ### Cons - Requires access to the Kubernetes API to launch new Pods. -- Requires dedicated node pool. [Details](#Use-dedicated-node-pool). -- Relies on a hostPath volume. [Details](#hostpath-volume). +- Requires dedicated node pool. [Details](#use-dedicated-node-pool). +- Relies on a hostPath volume. [Details](#disk-requirements). ## Deployment Diagram @@ -175,7 +175,7 @@ $ helm upgrade ... \ > This chart is designed around the use of `hostPath`, and using a network volume instead of hostPath means the entire Scalr run and Terraform runtime depend on it. > It is not possible to use a network volume only for the cache, the whole Scalr run would be executed on a network disk, and communication between worker and runner containers will be performed over the network. > This would negatively impact run performance and slow down the entire runtime. Using network disks is generally not recommended. -> If you want to mitigate risks associated with using hostPath, please consider dedicating a separate node pool for Scalr agents, or consider the [`agent-local`](/charts/agent-local) chart or the [`agent-job`](/charts/agent-job) chart instead. 
+> If you want to mitigate risks associated with using hostPath, please consider dedicating a separate node pool for Scalr agents, or consider the [`agent-local`](../agent-local) chart or the [`agent-job`](../agent-job) chart instead. Amazon EFS can be used as a shared ReadWriteMany volume instead of a node disk. To configure it, install the `Amazon EFS CSI Driver` via an add-on. See the documentation: https://docs.aws.amazon.com/eks/latest/userguide/efs-csi.html#efs-install-driver. @@ -214,7 +214,7 @@ The chart defaults to running the agent as root because `hostPath` storage is ty While you can configure `securityContext` to run as a non-root user, this is intended for [EFS](#amazon-efs) and is not supported for the default `hostPath` setup. The chart is expected to be run on a dedicated node pool to mitigate security risks associated with hostPath and root users. -If you consider this an issue, we recommend looking into the [`agent-local`](/charts/agent-local) or [`agent-job`](/charts/agent-job) (Beta) charts, which provide more secure defaults. +If you consider this an issue, we recommend looking into the [`agent-local`](../agent-local) or [`agent-job`](../agent-job) charts, which provide more secure defaults. ## Restrict Access to VM Metadata Service diff --git a/charts/agent-k8s/README.md.gotmpl b/charts/agent-k8s/README.md.gotmpl index 9cb24415..d26ef111 100644 --- a/charts/agent-k8s/README.md.gotmpl +++ b/charts/agent-k8s/README.md.gotmpl @@ -47,8 +47,8 @@ linearly based on the load. ### Cons - Requires access to the Kubernetes API to launch new Pods. -- Requires dedicated node pool. [Details](#Use-dedicated-node-pool). -- Relies on a hostPath volume. [Details](#hostpath-volume). +- Requires dedicated node pool. [Details](#use-dedicated-node-pool). +- Relies on a hostPath volume. [Details](#disk-requirements). ## Deployment Diagram @@ -173,7 +173,7 @@ $ helm upgrade ... 
\ > This chart is designed around the use of `hostPath`, and using a network volume instead of hostPath means the entire Scalr run and Terraform runtime depend on it. > It is not possible to use a network volume only for the cache, the whole Scalr run would be executed on a network disk, and communication between worker and runner containers will be performed over the network. > This would negatively impact run performance and slow down the entire runtime. Using network disks is generally not recommended. -> If you want to mitigate risks associated with using hostPath, please consider dedicating a separate node pool for Scalr agents, or consider the [`agent-local`](/charts/agent-local) chart or the [`agent-job`](/charts/agent-job) chart instead. +> If you want to mitigate risks associated with using hostPath, please consider dedicating a separate node pool for Scalr agents, or consider the [`agent-local`](../agent-local) chart or the [`agent-job`](../agent-job) chart instead. Amazon EFS can be used as a shared ReadWriteMany volume instead of a node disk. To configure it, install the `Amazon EFS CSI Driver` via an add-on. See the documentation: https://docs.aws.amazon.com/eks/latest/userguide/efs-csi.html#efs-install-driver. @@ -212,7 +212,7 @@ The chart defaults to running the agent as root because `hostPath` storage is ty While you can configure `securityContext` to run as a non-root user, this is intended for [EFS](#amazon-efs) and is not supported for the default `hostPath` setup. The chart is expected to be run on a dedicated node pool to mitigate security risks associated with hostPath and root users. -If you consider this an issue, we recommend looking into the [`agent-local`](/charts/agent-local) or [`agent-job`](/charts/agent-job) (Beta) charts, which provide more secure defaults. +If you consider this an issue, we recommend looking into the [`agent-local`](../agent-local) or [`agent-job`](../agent-job) charts, which provide more secure defaults. 
## Restrict Access to VM Metadata Service diff --git a/charts/agent-local/README.md b/charts/agent-local/README.md index c4603997..91846a2d 100644 --- a/charts/agent-local/README.md +++ b/charts/agent-local/README.md @@ -154,7 +154,7 @@ This chart deploys a set of static agent workers that process runs sequentially This chart is suitable when deploying separate agent deployments (agent pools) for each RBAC perimeter (e.g., Scalr Environment). For example, you can ensure each team in your organization works on individual agent pools without having access to each other's resources. -If you want to manage a single agent deployment across different teams within your organization, consider using the [`agent-job`](/charts/agent-job) chart. +If you want to manage a single agent deployment across different teams within your organization, consider using the [`agent-job`](../agent-job) chart. ### Agent Security Context diff --git a/charts/agent-local/README.md.gotmpl b/charts/agent-local/README.md.gotmpl index 2d21e890..3d1f07c2 100644 --- a/charts/agent-local/README.md.gotmpl +++ b/charts/agent-local/README.md.gotmpl @@ -152,7 +152,7 @@ This chart deploys a set of static agent workers that process runs sequentially This chart is suitable when deploying separate agent deployments (agent pools) for each RBAC perimeter (e.g., Scalr Environment). For example, you can ensure each team in your organization works on individual agent pools without having access to each other's resources. -If you want to manage a single agent deployment across different teams within your organization, consider using the [`agent-job`](/charts/agent-job) chart. +If you want to manage a single agent deployment across different teams within your organization, consider using the [`agent-job`](../agent-job) chart. 
### Agent Security Context From cb56c0c39fb9e51530ac1cf14906d47b71ea9c47 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 12:19:04 +0200 Subject: [PATCH 02/24] SCALRCORE-37660 update docs --- .github/actions/update-app-version/index.js | 12 ++++++------ README.md | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/actions/update-app-version/index.js b/.github/actions/update-app-version/index.js index 639b6891..530c516d 100644 --- a/.github/actions/update-app-version/index.js +++ b/.github/actions/update-app-version/index.js @@ -9,7 +9,7 @@ const chartsDir = path.join(process.env.GITHUB_WORKSPACE, 'charts') const appVersion = core.getInput('app_version', { required: true }) core.info(`The appVersion ${appVersion}`) -function getCharts() { +function getCharts () { const files = fs.readdirSync(chartsDir) const directories = files.filter((file) => { const filePath = path.join(chartsDir, file) @@ -19,7 +19,7 @@ function getCharts() { return directories } -function updateCharts(chart) { +function updateCharts (chart) { const chartPath = path.join(chartsDir, chart, 'Chart.yaml') const chartData = yaml.load(fs.readFileSync(chartPath, 'utf8')) @@ -31,7 +31,7 @@ function updateCharts(chart) { return chartData.version } -function updateCHANGELOG(chart, chartNewVersion) { +function updateCHANGELOG (chart, chartNewVersion) { const changelogPath = path.join(chartsDir, chart, 'CHANGELOG.md') const newSection = ` ## [v${chartNewVersion}] @@ -46,7 +46,7 @@ function updateCHANGELOG(chart, chartNewVersion) { fs.writeFileSync(changelogPath, updatedChangelog, 'utf8') } -async function pushChanges() { +async function pushChanges () { await exec.exec('git fetch') await exec.exec('git checkout master') await exec.exec('git config user.name "github-actions[bot]"') @@ -56,11 +56,11 @@ async function pushChanges() { await exec.exec('git push -u origin master') } -async function helmDocs() { +async function helmDocs () { await exec.exec('helm-docs') 
 }
 
-async function run() {
+async function run () {
   try {
     const charts = getCharts()
     charts.forEach(function (chart) {
diff --git a/README.md b/README.md
index 03b3a007..4de3a558 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,8 @@ You can then run `helm search repo scalr-agent-helm` to see the charts.
 
 This repository contains multiple charts for different deployment types and use cases.
 
 - [agent-local](./charts/agent-local) – Deploys a static number of agents and executes runs in shared agent pods. **This is the recommended default option for Run agent pools** and only option for VCS agent pool.
-- [agent-k8s](./charts/agent-k8s) – Deploys an agent controller with a set of agent workers and executes runs in isolated pods. Suitable for environments with strict multi-tenancy requirements. Requires more complex configuration and a separate node pool.
+- [agent-job](./charts/agent-job) – Deploys an agent controller and executes runs in isolated stateless jobs. Suitable for environments with strict multi-tenancy requirements.
+- [agent-k8s](./charts/agent-k8s) – Deploys an agent controller with a set of stateful agent workers and executes runs in isolated pods. Suitable for environments with strict multi-tenancy requirements. Requires more complex configuration and a separate node pool.
 
 ## Development

From 871f6ec96a16e72cd1f43bfc724a5531db95e34b Mon Sep 17 00:00:00 2001
From: Serhii Babak
Date: Thu, 12 Mar 2026 12:24:33 +0200
Subject: [PATCH 03/24] SCALRCORE-37660 update docs

---
 charts/agent-docker/README.md        | 2 +-
 charts/agent-docker/README.md.gotmpl | 2 +-
 charts/agent-job/README.md           | 8 +++++---
 charts/agent-job/README.md.gotmpl    | 8 +++++---
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/charts/agent-docker/README.md b/charts/agent-docker/README.md
index 4b250c97..0f30b6d5 100644
--- a/charts/agent-docker/README.md
+++ b/charts/agent-docker/README.md
@@ -7,7 +7,7 @@ where runs are executed in [dind](https://hub.docker.com/_/docker) sidecar conta
 Run phases are isolated into docker containers.
 
 > [!WARNING]
-> This chart is planned for deprecation. Use [`agent-local`](../agent-local) instead.
+> This chart will be deprecated and will stop receiving updates after March 31, 2026. Please consider using [`agent-local`](../agent-local) instead.
 
 ## Overview
 
diff --git a/charts/agent-docker/README.md.gotmpl b/charts/agent-docker/README.md.gotmpl
index e684b29d..1f78da4e 100644
--- a/charts/agent-docker/README.md.gotmpl
+++ b/charts/agent-docker/README.md.gotmpl
@@ -4,7 +4,7 @@
 {{ template "chart.description" . }}
 
 > [!WARNING]
-> This chart is planned for deprecation. Use [`agent-local`](../agent-local) instead.
+> This chart will be deprecated and will stop receiving updates after March 31, 2026. Please consider using [`agent-local`](../agent-local) instead.
## Overview diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index ec9d5328..b797901b 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -112,24 +112,26 @@ This chart relies on ImageVolume to provision application components via OCI reg When the agent controller spawns a Kubernetes Job for a Scalr Run, the Job is named using the pattern: -``` +```shell -- ``` Where: + - **basename**: Configurable prefix derived from the chart's fullname (defaults to `scalr-agent`). Override with `task.job.basename`. - **run-id**: Unique identifier assigned by the Scalr platform (e.g., `run-v0p500fu3s9ban8s8`). - **stage**: The execution stage (e.g., `plan`, `apply`, `policy`, etc). -The basename has a 32-character limit. If it exceeds this, it will be omitted. +If the final Job name after concatenation exceeds 63 characters (the Kubernetes Job name limit), the basename prefix will be omitted. -**Examples:** +Examples: | Release Name | `task.job.basename` | Resulting Job Name | |--------------|---------------------|-------------------| | scalr-agent | (empty) | scalr-agent-run-abcd1234-plan | | prod-agent | (empty) | prod-agent-run-abcd1234-apply | | scalr-agent | my-jobs | my-jobs-run-abcd1234-policy | +| scalr-agent | my-extra-long-....-basename | run-abcd1234-policy | To customize the basename: diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index bbd648f0..6d052cb5 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -109,24 +109,26 @@ This chart relies on ImageVolume to provision application components via OCI reg When the agent controller spawns a Kubernetes Job for a Scalr Run, the Job is named using the pattern: -``` +```shell -- ``` Where: + - **basename**: Configurable prefix derived from the chart's fullname (defaults to `scalr-agent`). Override with `task.job.basename`. 
- **run-id**: Unique identifier assigned by the Scalr platform (e.g., `run-v0p500fu3s9ban8s8`). - **stage**: The execution stage (e.g., `plan`, `apply`, `policy`, etc). -The basename has a 32-character limit. If it exceeds this, it will be omitted. +If the final Job name after concatenation exceeds 63 characters (the Kubernetes Job name limit), the basename prefix will be omitted. -**Examples:** +Examples: | Release Name | `task.job.basename` | Resulting Job Name | |--------------|---------------------|-------------------| | scalr-agent | (empty) | scalr-agent-run-abcd1234-plan | | prod-agent | (empty) | prod-agent-run-abcd1234-apply | | scalr-agent | my-jobs | my-jobs-run-abcd1234-policy | +| scalr-agent | my-extra-long-....-basename | run-abcd1234-policy | To customize the basename: From 1b75af3479d2a6c07b112da945b7552e277df7d0 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 12:59:18 +0200 Subject: [PATCH 04/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 40 ++++++++++++++- charts/agent-job/README.md.gotmpl | 40 ++++++++++++++- charts/agent-job/docs/buffer-pods.md | 74 ++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 charts/agent-job/docs/buffer-pods.md diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index b797901b..91a627c5 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -17,6 +17,7 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Planned Changes](#planned-changes) - [Agent Task Naming](#agent-task-naming) - [Custom Runner Images](#custom-runner-images) +- [High Availability](#high-availability) - [Performance Optimization](#performance-optimization) - [Termination](#termination) - [HTTP Proxy](#http-proxy) @@ -165,6 +166,42 @@ helm upgrade --install scalr-agent scalr-charts/agent-job \ --set task.runner.image.tag="v1.2.3" ``` +## High Availability + +This section describes strategies for 
hardening the deployment for high availability.
+
+### Multiple Controller Replicas
+
+By default the chart runs a single controller replica (`agent.replicaCount: 1`). Increasing the replica count distributes the run scheduling load and ensures the agent pool remains available during voluntary disruptions such as node upgrades or pod restarts.
+
+```shell
+helm upgrade --install scalr-agent scalr-charts/agent-job \
+  --set agent.replicaCount=2
+```
+
+When `agent.replicaCount > 1`, the chart automatically creates a `PodDisruptionBudget` (controlled by `agent.podDisruptionBudget`) that keeps at least one controller available during voluntary disruptions.
+
+### Separate Controllers and Task Pods
+
+It is recommended to run controller pods and task job pods on separate node pools. This prevents resource-intensive run workloads from competing with controllers for CPU and memory, which could delay run scheduling or cause controller eviction. It also allows upgrading or resizing the task node pool without interrupting controllers.
+
+Use `agent.nodeSelector` and `task.nodeSelector` to pin each to its own node pool:
+
+```yaml
+agent:
+  nodeSelector:
+    role: main
+task:
+  nodeSelector:
+    role: scalr-agent-runs
+```
+
+With task pods on a dedicated node pool, you can also scale that pool down to zero during periods of inactivity and let the cluster autoscaler provision nodes on demand when runs arrive.
+
+### Deploy Multiple Installations Within a Scalr Agent Pool
+
+You can connect multiple `agent-job` Helm releases to the same Scalr agent pool, each targeting a different node pool or availability zone. This allows the pool to scale horizontally across infrastructure boundaries without a single point of failure.
+
 ## Performance Optimization
 
 The following additional configurations are recommended to optimize Scalr Run startup time and overall chart performance.
@@ -174,9 +211,10 @@ The following additional configurations are recommended to optimize Scalr Run st This chart uses Jobs to launch Scalr Runs, so fast Job launch is critical for low Scalr Run startup latency. Common bottlenecks that may introduce latency include slow image pull times on cold nodes. To optimize this, you can: - Use image copies in an OCI-compatible registry mirror (Google Container Registry, Amazon Elastic Container Registry, Azure Container Registry, and similar) located in the same region as your node pool. This enables faster pull times and reduces the risk of hitting Docker Hub rate limits. -- Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`). - Enable [Image Streaming](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/image-streaming) (GKE only) to improve Job launch time. - [Build](#custom-runner-images) and use a smaller runner image tailored to your requirements. The default `task.runner.image` includes a wide variety of tools, including cloud CLIs (GCE, AWS, Azure), scripting language interpreters, and more, which makes it a relatively large image and may negatively impact image pull times. +- Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`) on clusters with a fixed number of nodes. +- Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled — buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. 
 
 ### Use Persistent Cache
 
diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl
index 6d052cb5..27ab0f80 100644
--- a/charts/agent-job/README.md.gotmpl
+++ b/charts/agent-job/README.md.gotmpl
@@ -14,6 +14,7 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor
 - [Planned Changes](#planned-changes)
 - [Agent Task Naming](#agent-task-naming)
 - [Custom Runner Images](#custom-runner-images)
+- [High Availability](#high-availability)
 - [Performance Optimization](#performance-optimization)
 - [Termination](#termination)
 - [HTTP Proxy](#http-proxy)
@@ -162,6 +163,42 @@ helm upgrade --install scalr-agent scalr-charts/{{ template "chart.name" . }} \
   --set task.runner.image.tag="v1.2.3"
 ```
 
+## High Availability
+
+This section describes strategies for hardening the deployment for high availability.
+
+### Multiple Controller Replicas
+
+By default the chart runs a single controller replica (`agent.replicaCount: 1`). Increasing the replica count distributes the run scheduling load and ensures the agent pool remains available during voluntary disruptions such as node upgrades or pod restarts.
+
+```shell
+helm upgrade --install scalr-agent scalr-charts/{{ template "chart.name" . }} \
+  --set agent.replicaCount=2
+```
+
+When `agent.replicaCount > 1`, the chart automatically creates a `PodDisruptionBudget` (controlled by `agent.podDisruptionBudget`) that keeps at least one controller available during voluntary disruptions.
+
+### Separate Controllers and Task Pods
+
+It is recommended to run controller pods and task job pods on separate node pools. This prevents resource-intensive run workloads from competing with controllers for CPU and memory, which could delay run scheduling or cause controller eviction. It also allows upgrading or resizing the task node pool without interrupting controllers.
+ +Use `agent.nodeSelector` and `task.nodeSelector` to pin each to its own node pool: + +```yaml +agent: + nodeSelector: + role: main +task: + nodeSelector: + role: scalr-agent-runs +``` + +With task pods on a dedicated node pool, you can also scale that pool down to zero during periods of inactivity and let the cluster autoscaler provision nodes on demand when runs arrive. + +### Deploy Multiple Installations Within a Scalr Agent Pool + +You can connect multiple `agent-job` Helm releases to the same Scalr agent pool, each targeting a different node pool or availability zone. This allows the pool to scale horizontally across infrastructure boundaries without a single point of failure. + ## Performance Optimization The following additional configurations are recommended to optimize Scalr Run startup time and overall chart performance. @@ -171,9 +208,10 @@ The following additional configurations are recommended to optimize Scalr Run st This chart uses Jobs to launch Scalr Runs, so fast Job launch is critical for low Scalr Run startup latency. Common bottlenecks that may introduce latency include slow image pull times on cold nodes. To optimize this, you can: - Use image copies in an OCI-compatible registry mirror (Google Container Registry, Amazon Elastic Container Registry, Azure Container Registry, and similar) located in the same region as your node pool. This enables faster pull times and reduces the risk of hitting Docker Hub rate limits. -- Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`). - Enable [Image Streaming](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/image-streaming) (GKE only) to improve Job launch time. - [Build](#custom-runner-images) and use a smaller runner image tailored to your requirements. 
The default `task.runner.image` includes a wide variety of tools, including cloud CLIs (GCE, AWS, Azure), scripting language interpreters, and more, which makes it a relatively large image and may negatively impact image pull times. +- Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`) on clusters with a fixed number of nodes. +- Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled — buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. ### Use Persistent Cache diff --git a/charts/agent-job/docs/buffer-pods.md b/charts/agent-job/docs/buffer-pods.md new file mode 100644 index 00000000..bd2f9f44 --- /dev/null +++ b/charts/agent-job/docs/buffer-pods.md @@ -0,0 +1,74 @@ +# Buffer Pods: Keeping Nodes Warm for Fast Run Startup + +On clusters with autoscaling enabled, the cluster autoscaler only provisions new nodes when unschedulable pods appear. This means the first Scalr Run after a period of inactivity may wait several minutes for a new node to become ready before the task pod can start. + +The buffer pod pattern addresses this by deploying a low-priority `Deployment` that pre-occupies resources on existing nodes. When a higher-priority task pod is scheduled, Kubernetes [preempts](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/) the buffer pods to free capacity immediately. The cluster autoscaler then replaces the evicted buffer pods in the background, keeping nodes warm for the next run. + +## Step 1 — Create a low-priority PriorityClass + +```yaml +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: scalr-agent-buffer +value: -10 +globalDefault: false +preemptionPolicy: Never +description: "Low-priority buffer pods that reserve node capacity for Scalr task pods." 
+``` + +## Step 2 — Deploy buffer pods sized to match your task pods + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: scalr-agent-buffer + namespace: +spec: + replicas: 2 # keep as many nodes warm as needed + selector: + matchLabels: + app: scalr-agent-buffer + template: + metadata: + labels: + app: scalr-agent-buffer + spec: + priorityClassName: scalr-agent-buffer + terminationGracePeriodSeconds: 0 + nodeSelector: {} # match the same node pool as your task pods + tolerations: [] # match task.tolerations if set + containers: + - name: worker + image: scalr/agent:latest # pin to the same tag as agent.image.tag + command: ["sleep", "infinity"] + resources: + requests: + cpu: 250m + memory: 256Mi + - name: runner + image: scalr/runner:0.2.0 # pin to the same tag as task.runner.image.tag + command: ["sleep", "infinity"] + resources: + requests: + # Size to match task pod total: task.worker + task.runner requests + cpu: 500m + memory: 512Mi +``` + +Using the actual `scalr/agent` and `scalr/runner` images serves a second purpose: the container runtime pulls and caches the images on the node. When a real task pod is scheduled on that node, the images are already present and the pull step is skipped, eliminating a major source of cold-start latency. + +## Tuning + +- **`replicas`** — set to the number of nodes you want to keep warm. +- **`resources.requests`** — match the requests of the worker and runner containers separately (`task.worker.resources.requests` and `task.runner.resources.requests`). The values above reflect the chart defaults. +- **`image` tags** — pin to the same tags as `agent.image.tag` and `task.runner.image.tag` in your Helm values so the cached image is the one actually used by task pods. +- **`nodeSelector` / `tolerations`** — mirror `task.nodeSelector` and `task.tolerations` so buffer pods land on the same node pool as task pods. 
+ +> [!NOTE] +> `terminationGracePeriodSeconds: 0` ensures buffer pods are evicted instantly when preempted, releasing capacity for the incoming task pod without delay. + +### Links + +- [Understanding and Combining GKE Autoscaling Strategies](https://www.skills.google/focuses/15636?locale=pt_PT&parent=catalog&qlcampaign=5k-dodl-65) From 15920b124491c94c6ef7e35b9bdf04bb51216459 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 13:00:03 +0200 Subject: [PATCH 05/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 2 +- charts/agent-job/README.md.gotmpl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 91a627c5..c836f931 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -214,7 +214,7 @@ This chart uses Jobs to launch Scalr Runs, so fast Job launch is critical for lo - Enable [Image Streaming](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/image-streaming) (GKE only) to improve Job launch time. - [Build](#custom-runner-images) and use a smaller runner image tailored to your requirements. The default `task.runner.image` includes a wide variety of tools, including cloud CLIs (GCE, AWS, Azure), scripting language interpreters, and more, which makes it a relatively large image and may negatively impact image pull times. - Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`) on clusters with a fixed number of nodes. -- Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled — buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. +- Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled - buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. 
### Use Persistent Cache diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 27ab0f80..f141d264 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -211,7 +211,7 @@ This chart uses Jobs to launch Scalr Runs, so fast Job launch is critical for lo - Enable [Image Streaming](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/image-streaming) (GKE only) to improve Job launch time. - [Build](#custom-runner-images) and use a smaller runner image tailored to your requirements. The default `task.runner.image` includes a wide variety of tools, including cloud CLIs (GCE, AWS, Azure), scripting language interpreters, and more, which makes it a relatively large image and may negatively impact image pull times. - Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`) on clusters with a fixed number of nodes. -- Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled — buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. +- Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled - buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. 
### Use Persistent Cache From a12d6a159f786dc2e2c6cb94985adf75a2ed8d66 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 13:10:23 +0200 Subject: [PATCH 06/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 2 ++ charts/agent-job/README.md.gotmpl | 2 ++ 2 files changed, 4 insertions(+) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index c836f931..a2b77f32 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -216,6 +216,8 @@ This chart uses Jobs to launch Scalr Runs, so fast Job launch is critical for lo - Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`) on clusters with a fixed number of nodes. - Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled - buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. +You can configure [OTLP monitoring](#metrics-and-observability) and use the [scalr_agent.core.kubernetes_job_startup_latency_seconds](https://docs.scalr.io/docs/metrics#scalr_agentcorekubernetes_job_startup_latency_seconds) metric to track run startup time. You may observe spikes during node scaling, but good average values range from 3 to 5 seconds. + ### Use Persistent Cache A major performance bottleneck in any IaC pipeline is the time spent re-downloading binaries, providers, and modules during each run. To optimize this, we recommend enabling [Cache Directory Persistence](#cache-volume-persistence). 
diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index f141d264..f0172aa0 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -213,6 +213,8 @@ This chart uses Jobs to launch Scalr Runs, so fast Job launch is critical for lo - Use a [DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to preemptively cache all images used in this chart (`scalr/agent`, `scalr/runner`) on clusters with a fixed number of nodes. - Use [buffer pods](docs/buffer-pods.md) on clusters with autoscaling enabled - buffer pods keep nodes warm to eliminate cluster autoscaler cold-start delays and pre-cache images at the same time. +You can configure [OTLP monitoring](#metrics-and-observability) and use the [scalr_agent.core.kubernetes_job_startup_latency_seconds](https://docs.scalr.io/docs/metrics#scalr_agentcorekubernetes_job_startup_latency_seconds) metric to track run startup time. You may observe spikes during node scaling, but good average values range from 3 to 5 seconds. + ### Use Persistent Cache A major performance bottleneck in any IaC pipeline is the time spent re-downloading binaries, providers, and modules during each run. To optimize this, we recommend enabling [Cache Directory Persistence](#cache-volume-persistence). 
From 95b6d2ecaef97175c7b893045f720fbd11d7ce33 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 22:19:58 +0200 Subject: [PATCH 07/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 1 - charts/agent-job/README.md.gotmpl | 1 - 2 files changed, 2 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index a2b77f32..64a8adde 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -34,7 +34,6 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - Kubernetes 1.35+ - Helm 3.0+ -- Cluster-admin permissions (or a role with `customresourcedefinitions` create/update at cluster scope) to install the bundled [CRD](#custom-resource-definitions) - Optional: [ReadWriteMany](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) volume for [Cache Volume Persistence](#cache-volume-persistence) ## Installation diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index f0172aa0..aa0d5bba 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -31,7 +31,6 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - Kubernetes 1.35+ - Helm 3.0+ -- Cluster-admin permissions (or a role with `customresourcedefinitions` create/update at cluster scope) to install the bundled [CRD](#custom-resource-definitions) - Optional: [ReadWriteMany](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) volume for [Cache Volume Persistence](#cache-volume-persistence) ## Installation From 31f60c2176615d696f8bf69363bef223ce17961b Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 22:55:59 +0200 Subject: [PATCH 08/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 35 +++++++++++++++++++++++++++-- charts/agent-job/README.md.gotmpl | 35 +++++++++++++++++++++++++++-- charts/agent-local/README.md | 29 ++++++++++++++++++++++++ 
charts/agent-local/README.md.gotmpl | 29 ++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 4 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 64a8adde..b8ebf5bb 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -14,6 +14,7 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Installation](#installation) - [Overview](#overview) - [Architecture Diagram](#architecture-diagram) +- [Versioning Policy](#versioning-policy) - [Planned Changes](#planned-changes) - [Agent Task Naming](#agent-task-naming) - [Custom Runner Images](#custom-runner-images) @@ -26,6 +27,7 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Security](#security) - [Job History Management](#job-history-management) - [Metrics and Observability](#metrics-and-observability) +- [Network Requirements](#network-requirements) - [Custom Resource Definitions](#custom-resource-definitions) - [RBAC](#rbac) - [Troubleshooting and Support](#troubleshooting-and-support) @@ -98,6 +100,18 @@ See [template](https://github.com/Scalr/agent-helm/blob/master/charts/agent-job/

+## Versioning Policy
+
+This chart deploys the [Scalr Agent](https://docs.scalr.io/docs/agent-pools) using the [`scalr/agent`](https://hub.docker.com/r/scalr/agent) image. The agent supports multiple runtimes beyond Kubernetes and is versioned independently from this chart.
+
+Each new agent release triggers a new chart release with an updated `appVersion`. The two changelogs cover different scopes:
+
+- [Scalr Agent changelog](https://docs.scalr.io/docs/changelog) — application-level changes and new Scalr platform functionality
+- [CHANGELOG.md](CHANGELOG.md) — chart-level changes: Kubernetes resources, values, and defaults
+
+> [!WARNING]
+> Overriding `appVersion` to a version other than the one shipped with the chart is not recommended. Releases are tested and coordinated with a specific agent version, and mismatched combinations may include breaking changes between application and infrastructure code.
+
 ## Planned Changes
 
 This section outlines planned architecture changes that may be relevant for long-term chart maintenance.
@@ -182,7 +196,7 @@ When `agent.replicaCount > 1`, the chart automatically creates a `PodDisruptionB
 
 ### Separate Controllers and Task Pods
 
-It is recommended to run controller pods and task job pods on separate node pools. This prevents resource-intensive run workloads from competing with controllers for CPU and memory, which could delay run scheduling or cause controller eviction. It also allows upgrading or resizing the task node pool without interrupting controllers.
+It is recommended to run agent controller pods and task job pods on separate node pools. This prevents resource-intensive run workloads from competing with controllers for CPU and memory, which could delay run scheduling or cause controller eviction. It also allows upgrading or resizing the task node pool without interrupting controllers, which are responsible for scheduling incoming runs.
Use `agent.nodeSelector` and `task.nodeSelector` to pin each to its own node pool: @@ -199,7 +213,9 @@ With task pods on a dedicated node pool, you can also scale that pool down to ze ### Deploy Multiple Installations Within a Scalr Agent Pool -You can connect multiple `agent-job` Helm releases to the same Scalr agent pool, each targeting a different node pool or availability zone. This allows the pool to scale horizontally across infrastructure boundaries without a single point of failure. +You can connect multiple `agent-job` Helm releases to the same Scalr agent pool, each targeting a different node pool or availability zone. + +This allows the pool to scale horizontally across infrastructure boundaries without a single point of failure. ## Performance Optimization @@ -437,6 +453,21 @@ Ensure your cluster uses a CNI plugin that supports egress NetworkPolicies. Test | GKE | Dataplane V1 (Calico) | ❌ | | GKE | [Dataplane V2](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/dataplane-v2) (Cilium/eBPF) | ✅ | +## Network Requirements + +The agent requires outbound HTTPS access to the following endpoints: + +| Hostname | Port | Purpose | +| ------------------------------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| scalr.io | 443 | Polling for new tasks, posting status updates and logs, downloading IaC configuration versions, private modules, and software binary releases | +| docker.io, docker.com, cloudfront.net | 443 | Pulling the [scalr/agent](https://hub.docker.com/r/scalr/agent) and [scalr/runner](https://hub.docker.com/r/scalr/runner) images | +| registry.opentofu.org | 443 | Downloading public providers and modules from the OpenTofu Registry | +| registry.terraform.io | 443 | Downloading public providers and modules from the Terraform Registry | + +Ensure the agent can also reach any services required by your 
OpenTofu/Terraform configurations or hook scripts, such as cloud provider APIs, VCS providers, or custom software distribution endpoints. + +If you use custom module or provider registries, or Docker registry mirrors, additional network access rules may be required. + ## Job History Management Kubernetes automatically removes Jobs after `task.job.ttlSecondsAfterFinished` seconds (default: 60). Increase this value for debugging or to preserve job history longer, or decrease it to optimize cluster resource usage. diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index aa0d5bba..9bcc2917 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -11,6 +11,7 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Installation](#installation) - [Overview](#overview) - [Architecture Diagram](#architecture-diagram) +- [Versioning Policy](#versioning-policy) - [Planned Changes](#planned-changes) - [Agent Task Naming](#agent-task-naming) - [Custom Runner Images](#custom-runner-images) @@ -23,6 +24,7 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Security](#security) - [Job History Management](#job-history-management) - [Metrics and Observability](#metrics-and-observability) +- [Network Requirements](#network-requirements) - [Custom Resource Definitions](#custom-resource-definitions) - [RBAC](#rbac) - [Troubleshooting and Support](#troubleshooting-and-support) @@ -95,6 +97,18 @@ See [template](https://github.com/Scalr/agent-helm/blob/master/charts/agent-job/

+## Versioning Policy
+
+This chart deploys the [Scalr Agent](https://docs.scalr.io/docs/agent-pools) using the [`scalr/agent`](https://hub.docker.com/r/scalr/agent) image. The agent supports multiple runtimes beyond Kubernetes and is versioned independently from this chart.
+
+Each new agent release triggers a new chart release with an updated `appVersion`. The two changelogs cover different scopes:
+
+- [Scalr Agent changelog](https://docs.scalr.io/docs/changelog) — application-level changes and new Scalr platform functionality
+- [CHANGELOG.md](CHANGELOG.md) — chart-level changes: Kubernetes resources, values, and defaults
+
+> [!WARNING]
+> Overriding `appVersion` to a version other than the one shipped with the chart is not recommended. Releases are tested and coordinated with a specific agent version, and mismatched combinations may include breaking changes between application and infrastructure code.
+
 ## Planned Changes
 
 This section outlines planned architecture changes that may be relevant for long-term chart maintenance.
@@ -179,7 +193,7 @@ When `agent.replicaCount > 1`, the chart automatically creates a `PodDisruptionB
 
 ### Separate Controllers and Task Pods
 
-It is recommended to run controller pods and task job pods on separate node pools. This prevents resource-intensive run workloads from competing with controllers for CPU and memory, which could delay run scheduling or cause controller eviction. It also allows upgrading or resizing the task node pool without interrupting controllers.
+It is recommended to run agent controller pods and task job pods on separate node pools. This prevents resource-intensive run workloads from competing with controllers for CPU and memory, which could delay run scheduling or cause controller eviction. It also allows upgrading or resizing the task node pool without interrupting controllers, which are responsible for scheduling incoming runs.
Use `agent.nodeSelector` and `task.nodeSelector` to pin each to its own node pool: @@ -196,7 +210,9 @@ With task pods on a dedicated node pool, you can also scale that pool down to ze ### Deploy Multiple Installations Within a Scalr Agent Pool -You can connect multiple `agent-job` Helm releases to the same Scalr agent pool, each targeting a different node pool or availability zone. This allows the pool to scale horizontally across infrastructure boundaries without a single point of failure. +You can connect multiple `agent-job` Helm releases to the same Scalr agent pool, each targeting a different node pool or availability zone. + +This allows the pool to scale horizontally across infrastructure boundaries without a single point of failure. ## Performance Optimization @@ -434,6 +450,21 @@ Ensure your cluster uses a CNI plugin that supports egress NetworkPolicies. Test | GKE | Dataplane V1 (Calico) | ❌ | | GKE | [Dataplane V2](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/dataplane-v2) (Cilium/eBPF) | ✅ | +## Network Requirements + +The agent requires outbound HTTPS access to the following endpoints: + +| Hostname | Port | Purpose | +| ------------------------------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| scalr.io | 443 | Polling for new tasks, posting status updates and logs, downloading IaC configuration versions, private modules, and software binary releases | +| docker.io, docker.com, cloudfront.net | 443 | Pulling the [scalr/agent](https://hub.docker.com/r/scalr/agent) and [scalr/runner](https://hub.docker.com/r/scalr/runner) images | +| registry.opentofu.org | 443 | Downloading public providers and modules from the OpenTofu Registry | +| registry.terraform.io | 443 | Downloading public providers and modules from the Terraform Registry | + +Ensure the agent can also reach any services required by your 
OpenTofu/Terraform configurations or hook scripts, such as cloud provider APIs, VCS providers, or custom software distribution endpoints. + +If you use custom module or provider registries, or Docker registry mirrors, additional network access rules may be required. + ## Job History Management Kubernetes automatically removes Jobs after `task.job.ttlSecondsAfterFinished` seconds (default: 60). Increase this value for debugging or to preserve job history longer, or decrease it to optimize cluster resource usage. diff --git a/charts/agent-local/README.md b/charts/agent-local/README.md index 91846a2d..f2adc733 100644 --- a/charts/agent-local/README.md +++ b/charts/agent-local/README.md @@ -11,11 +11,13 @@ Deploys a static number of agents and executes runs in shared agent pods. - [Installation](#installation) - [Overview](#overview) - [Architecture Diagram](#architecture-diagram) +- [Versioning Policy](#versioning-policy) - [Configuration](#configuration) - [Customizing Environment](#customizing-environment) - [Volumes](#volumes) - [Security](#security) - [Metrics and Observability](#metrics-and-observability) +- [Network Requirements](#network-requirements) - [Termination](#termination) - [Troubleshooting](#troubleshooting) @@ -68,6 +70,18 @@ The concurrency of each agent instance is limited to 1. To scale concurrency, th

+## Versioning Policy + +This chart deploys the [Scalr Agent](https://docs.scalr.io/docs/agent-pools) using the [`scalr/agent`](https://hub.docker.com/r/scalr/agent) image. The agent supports multiple runtimes beyond Kubernetes and is versioned independently from this chart. + +Each new agent release triggers a new chart release with an updated `appVersion`. The two changelogs cover different scopes: + +- [Scalr Agent changelog](https://docs.scalr.io/docs/changelog) — application-level changes and new Scalr platform functionality +- [CHANGELOG.md](CHANGELOG.md) — chart-level changes: Kubernetes resources, values, and defaults + +> [!WARNING] +> Overriding `appVersion` to a version other than the one shipped with the chart is not recommended. Releases are tested and coordinated with a specific agent version, and mismatched combinations may include breaking changes between application and infrastructure code. + ## Configuration The Scalr Agent is configured using environment variables, which can be set using the `extraEnv` option in the Helm chart. @@ -262,6 +276,21 @@ When a agent container exceeds its memory limit, Kubernetes sends SIGKILL direct Monitor your run's resource usage and configure resource requests and limits accordingly to mitigate the risk of unexpected termination. 
+## Network Requirements + +The agent requires outbound HTTPS access to the following endpoints: + +| Hostname | Port | Purpose | +| ------------------------------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| scalr.io | 443 | Polling for new tasks, posting status updates and logs, downloading IaC configuration versions, private modules, and software binary releases | +| docker.io, docker.com, cloudfront.net | 443 | Pulling the [scalr/agent](https://hub.docker.com/r/scalr/agent) and [scalr/runner](https://hub.docker.com/r/scalr/runner) images | +| registry.opentofu.org | 443 | Downloading public providers and modules from the OpenTofu Registry | +| registry.terraform.io | 443 | Downloading public providers and modules from the Terraform Registry | + +Ensure the agent can also reach any services required by your OpenTofu/Terraform configurations or hook scripts, such as cloud provider APIs, VCS providers, or custom software distribution endpoints. + +If you use custom module or provider registries, or Docker registry mirrors, additional network access rules may be required. + ## Troubleshooting If you encounter internal system errors or unexpected behavior, please open a Scalr Support request at [Scalr Support Center](https://scalr-labs.atlassian.net/servicedesk/customer/portal/31). 
diff --git a/charts/agent-local/README.md.gotmpl b/charts/agent-local/README.md.gotmpl index 3d1f07c2..dc332f6a 100644 --- a/charts/agent-local/README.md.gotmpl +++ b/charts/agent-local/README.md.gotmpl @@ -9,11 +9,13 @@ - [Installation](#installation) - [Overview](#overview) - [Architecture Diagram](#architecture-diagram) +- [Versioning Policy](#versioning-policy) - [Configuration](#configuration) - [Customizing Environment](#customizing-environment) - [Volumes](#volumes) - [Security](#security) - [Metrics and Observability](#metrics-and-observability) +- [Network Requirements](#network-requirements) - [Termination](#termination) - [Troubleshooting](#troubleshooting) @@ -66,6 +68,18 @@ The concurrency of each agent instance is limited to 1. To scale concurrency, th

+## Versioning Policy + +This chart deploys the [Scalr Agent](https://docs.scalr.io/docs/agent-pools) using the [`scalr/agent`](https://hub.docker.com/r/scalr/agent) image. The agent supports multiple runtimes beyond Kubernetes and is versioned independently from this chart. + +Each new agent release triggers a new chart release with an updated `appVersion`. The two changelogs cover different scopes: + +- [Scalr Agent changelog](https://docs.scalr.io/docs/changelog) — application-level changes and new Scalr platform functionality +- [CHANGELOG.md](CHANGELOG.md) — chart-level changes: Kubernetes resources, values, and defaults + +> [!WARNING] +> Overriding `appVersion` to a version other than the one shipped with the chart is not recommended. Releases are tested and coordinated with a specific agent version, and mismatched combinations may include breaking changes between application and infrastructure code. + ## Configuration The Scalr Agent is configured using environment variables, which can be set using the `extraEnv` option in the Helm chart. @@ -260,6 +274,21 @@ When a agent container exceeds its memory limit, Kubernetes sends SIGKILL direct Monitor your run's resource usage and configure resource requests and limits accordingly to mitigate the risk of unexpected termination. 
+## Network Requirements + +The agent requires outbound HTTPS access to the following endpoints: + +| Hostname | Port | Purpose | +| ------------------------------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| scalr.io | 443 | Polling for new tasks, posting status updates and logs, downloading IaC configuration versions, private modules, and software binary releases | +| docker.io, docker.com, cloudfront.net | 443 | Pulling the [scalr/agent](https://hub.docker.com/r/scalr/agent) and [scalr/runner](https://hub.docker.com/r/scalr/runner) images | +| registry.opentofu.org | 443 | Downloading public providers and modules from the OpenTofu Registry | +| registry.terraform.io | 443 | Downloading public providers and modules from the Terraform Registry | + +Ensure the agent can also reach any services required by your OpenTofu/Terraform configurations or hook scripts, such as cloud provider APIs, VCS providers, or custom software distribution endpoints. + +If you use custom module or provider registries, or Docker registry mirrors, additional network access rules may be required. + ## Troubleshooting If you encounter internal system errors or unexpected behavior, please open a Scalr Support request at [Scalr Support Center](https://scalr-labs.atlassian.net/servicedesk/customer/portal/31). 
From abf1ebc19852ec1890916ca833276c4069a1703c Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 23:15:07 +0200 Subject: [PATCH 09/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 38 +++++++++++++++---------------- charts/agent-job/README.md.gotmpl | 38 +++++++++++++++---------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index b8ebf5bb..0ece8bb6 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -15,7 +15,6 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Overview](#overview) - [Architecture Diagram](#architecture-diagram) - [Versioning Policy](#versioning-policy) -- [Planned Changes](#planned-changes) - [Agent Task Naming](#agent-task-naming) - [Custom Runner Images](#custom-runner-images) - [High Availability](#high-availability) @@ -25,11 +24,12 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Custom Certificate Authorities](#custom-certificate-authorities) - [Volumes](#volumes) - [Security](#security) +- [Network Requirements](#network-requirements) - [Job History Management](#job-history-management) - [Metrics and Observability](#metrics-and-observability) -- [Network Requirements](#network-requirements) -- [Custom Resource Definitions](#custom-resource-definitions) - [RBAC](#rbac) +- [Custom Resource Definitions](#custom-resource-definitions) +- [Planned Changes](#planned-changes) - [Troubleshooting and Support](#troubleshooting-and-support) ## Prerequisites @@ -112,16 +112,6 @@ Each new agent release triggers a new chart release with an updated `appVersion` > [!WARNING] > Overriding `appVersion` to a version other than the one shipped with the chart is not recommended. Releases are tested and coordinated with a specific agent version, and mismatched combinations may include breaking changes between application and infrastructure code. 
-## Planned Changes - -This section outlines planned architecture changes that may be relevant for long-term chart maintenance. - -### Update Minimum Requirements to Kubernetes 1.36 Once GA - -Update the minimum required Kubernetes version to 1.36, which includes the stable [ImageVolume](https://kubernetes.io/docs/tasks/configure-pod-container/image-volumes/) feature and containerd 2.2+ with [subPath](https://github.com/containerd/containerd/pull/11578) support for ImageVolume. -In Kubernetes 1.35 (current minimal required version), ImageVolume is in Beta status but enabled by default, and we consider it ready for limited usage. -This chart relies on ImageVolume to provision application components via OCI registry and plans to use this feature more heavily in the future. - ## Agent Task Naming When the agent controller spawns a Kubernetes Job for a Scalr Run, the Job is named using the pattern: @@ -536,6 +526,16 @@ agent: Leave `sentryDsn` empty (the default) to disable Sentry integration. +## RBAC + +By default the chart provisions: + +- **ServiceAccount** used by the controller and task pods +- **Role/RoleBinding** with namespaced access to manage pods/jobs and related resources needed for task execution +- **ClusterRole/ClusterRoleBinding** granting read access to `AgentTaskTemplate` resources (`agenttasktemplates.scalr.io`) + +Set `rbac.create=false` to bring your own ServiceAccount/Rules, or adjust permissions with `rbac.rules` and `rbac.clusterRules`. + ## Custom Resource Definitions This chart bundles the `agenttasktemplates.scalr.io` CRD and installs or upgrades it automatically via Helm. The CRD defines the job template that the controller uses to create task pods. @@ -548,15 +548,15 @@ Verify installation: kubectl get crd agenttasktemplates.scalr.io ``` -## RBAC +## Planned Changes -By default the chart provisions: +This section outlines planned architecture changes that may be relevant for long-term chart maintenance. 
-- **ServiceAccount** used by the controller and task pods -- **Role/RoleBinding** with namespaced access to manage pods/jobs and related resources needed for task execution -- **ClusterRole/ClusterRoleBinding** granting read access to `AgentTaskTemplate` resources (`agenttasktemplates.scalr.io`) +### Update Minimum Requirements to Kubernetes 1.36 Once GA -Set `rbac.create=false` to bring your own ServiceAccount/Rules, or adjust permissions with `rbac.rules` and `rbac.clusterRules`. +Update the minimum required Kubernetes version to 1.36, which includes the stable [ImageVolume](https://kubernetes.io/docs/tasks/configure-pod-container/image-volumes/) feature and containerd 2.2+ with [subPath](https://github.com/containerd/containerd/pull/11578) support for ImageVolume. +In Kubernetes 1.35 (current minimal required version), ImageVolume is in Beta status but enabled by default, and we consider it ready for limited usage. +This chart relies on ImageVolume to provision application components via OCI registry and plans to use this feature more heavily in the future. 
## Troubleshooting and Support diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 9bcc2917..901b9ddd 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -12,7 +12,6 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Overview](#overview) - [Architecture Diagram](#architecture-diagram) - [Versioning Policy](#versioning-policy) -- [Planned Changes](#planned-changes) - [Agent Task Naming](#agent-task-naming) - [Custom Runner Images](#custom-runner-images) - [High Availability](#high-availability) @@ -22,11 +21,12 @@ See the [official documentation](https://docs.scalr.io/docs/agent-pools) for mor - [Custom Certificate Authorities](#custom-certificate-authorities) - [Volumes](#volumes) - [Security](#security) +- [Network Requirements](#network-requirements) - [Job History Management](#job-history-management) - [Metrics and Observability](#metrics-and-observability) -- [Network Requirements](#network-requirements) -- [Custom Resource Definitions](#custom-resource-definitions) - [RBAC](#rbac) +- [Custom Resource Definitions](#custom-resource-definitions) +- [Planned Changes](#planned-changes) - [Troubleshooting and Support](#troubleshooting-and-support) ## Prerequisites @@ -109,16 +109,6 @@ Each new agent release triggers a new chart release with an updated `appVersion` > [!WARNING] > Overriding `appVersion` to a version other than the one shipped with the chart is not recommended. Releases are tested and coordinated with a specific agent version, and mismatched combinations may include breaking changes between application and infrastructure code. -## Planned Changes - -This section outlines planned architecture changes that may be relevant for long-term chart maintenance. 
- -### Update Minimum Requirements to Kubernetes 1.36 Once GA - -Update the minimum required Kubernetes version to 1.36, which includes the stable [ImageVolume](https://kubernetes.io/docs/tasks/configure-pod-container/image-volumes/) feature and containerd 2.2+ with [subPath](https://github.com/containerd/containerd/pull/11578) support for ImageVolume. -In Kubernetes 1.35 (current minimal required version), ImageVolume is in Beta status but enabled by default, and we consider it ready for limited usage. -This chart relies on ImageVolume to provision application components via OCI registry and plans to use this feature more heavily in the future. - ## Agent Task Naming When the agent controller spawns a Kubernetes Job for a Scalr Run, the Job is named using the pattern: @@ -533,6 +523,16 @@ agent: Leave `sentryDsn` empty (the default) to disable Sentry integration. +## RBAC + +By default the chart provisions: + +- **ServiceAccount** used by the controller and task pods +- **Role/RoleBinding** with namespaced access to manage pods/jobs and related resources needed for task execution +- **ClusterRole/ClusterRoleBinding** granting read access to `AgentTaskTemplate` resources (`agenttasktemplates.scalr.io`) + +Set `rbac.create=false` to bring your own ServiceAccount/Rules, or adjust permissions with `rbac.rules` and `rbac.clusterRules`. + ## Custom Resource Definitions This chart bundles the `agenttasktemplates.scalr.io` CRD and installs or upgrades it automatically via Helm. The CRD defines the job template that the controller uses to create task pods. @@ -545,15 +545,15 @@ Verify installation: kubectl get crd agenttasktemplates.scalr.io ``` -## RBAC +## Planned Changes -By default the chart provisions: +This section outlines planned architecture changes that may be relevant for long-term chart maintenance. 
-- **ServiceAccount** used by the controller and task pods -- **Role/RoleBinding** with namespaced access to manage pods/jobs and related resources needed for task execution -- **ClusterRole/ClusterRoleBinding** granting read access to `AgentTaskTemplate` resources (`agenttasktemplates.scalr.io`) +### Update Minimum Requirements to Kubernetes 1.36 Once GA -Set `rbac.create=false` to bring your own ServiceAccount/Rules, or adjust permissions with `rbac.rules` and `rbac.clusterRules`. +Update the minimum required Kubernetes version to 1.36, which includes the stable [ImageVolume](https://kubernetes.io/docs/tasks/configure-pod-container/image-volumes/) feature and containerd 2.2+ with [subPath](https://github.com/containerd/containerd/pull/11578) support for ImageVolume. +In Kubernetes 1.35 (current minimal required version), ImageVolume is in Beta status but enabled by default, and we consider it ready for limited usage. +This chart relies on ImageVolume to provision application components via OCI registry and plans to use this feature more heavily in the future. ## Troubleshooting and Support From 4f6a867fbf6bd7fe36aeb6d2bacce470d51c0b2c Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 23:36:45 +0200 Subject: [PATCH 10/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 19 +++++++++++++++++++ charts/agent-job/README.md.gotmpl | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 0ece8bb6..e4fad3f4 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -402,6 +402,25 @@ Optionally, you can configure a PVC using `persistence.data.enabled` and `persis ## Security +This section describes the security model of the chart, covering how the agent authenticates with the Scalr platform, how run workloads are isolated, and how access to cloud credentials is controlled. 
+ +### Authentication and Authorization + +The agent authenticates with the Scalr platform using a token hierarchy with progressively narrower scopes — each token is a JWT and carries only the permissions required for its specific role: + +1. Agent Pool Token — a long-lived JWT configured via the `agent.token` Helm value (exposed as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. +2. Agent Session Token — a short-lived token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. It is held in memory only (never written to disk) and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. +3. Agent Task Token — a scoped token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. Exists only for the lifetime of the task. +4. Scalr Run Token — a token exposed to the run environment as `SCALR_TOKEN` shell variable inside an agent runner container (i.e. available to OpenTofu/Terraform and provisioner scripts). It is scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within a context of run's workspace. + +Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. + +The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes task assignments and control commands (e.g. cancel) through this relay. All connections are outbound — the platform never initiates inbound connections to the agent. 
+ +All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. + +If a token is revoked or returns a `401`, the agent shuts down immediately. + ### Multi-tenant Isolation This chart provides strong isolation for multi-tenant environments by deploying each run in a separate container with restricted filesystem access. diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 901b9ddd..714570fe 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -399,6 +399,25 @@ Optionally, you can configure a PVC using `persistence.data.enabled` and `persis ## Security +This section describes the security model of the chart, covering how the agent authenticates with the Scalr platform, how run workloads are isolated, and how access to cloud credentials is controlled. + +### Authentication and Authorization + +The agent authenticates with the Scalr platform using a token hierarchy with progressively narrower scopes — each token is a JWT and carries only the permissions required for its specific role: + +1. Agent Pool Token — a long-lived JWT configured via the `agent.token` Helm value (exposed as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. +2. Agent Session Token — a short-lived token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. It is held in memory only (never written to disk) and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. +3. Agent Task Token — a scoped token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. Exists only for the lifetime of the task. 
+4. Scalr Run Token — a token exposed to the run environment as `SCALR_TOKEN` shell variable inside an agent runner container (i.e. available to OpenTofu/Terraform and provisioner scripts). It is scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within a context of run's workspace. + +Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. + +The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes task assignments and control commands (e.g. cancel) through this relay. All connections are outbound — the platform never initiates inbound connections to the agent. + +All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. + +If a token is revoked or returns a `401`, the agent shuts down immediately. + ### Multi-tenant Isolation This chart provides strong isolation for multi-tenant environments by deploying each run in a separate container with restricted filesystem access. 
From 76ec9ae01049bcc6318edefd3de0280668b9d81f Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Thu, 12 Mar 2026 23:41:20 +0200 Subject: [PATCH 11/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 10 +++++----- charts/agent-job/README.md.gotmpl | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index e4fad3f4..704b5746 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -408,14 +408,14 @@ This section describes the security model of the chart, covering how the agent a The agent authenticates with the Scalr platform using a token hierarchy with progressively narrower scopes — each token is a JWT and carries only the permissions required for its specific role: -1. Agent Pool Token — a long-lived JWT configured via the `agent.token` Helm value (exposed as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. -2. Agent Session Token — a short-lived token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. It is held in memory only (never written to disk) and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. -3. Agent Task Token — a scoped token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. Exists only for the lifetime of the task. -4. Scalr Run Token — a token exposed to the run environment as `SCALR_TOKEN` shell variable inside an agent runner container (i.e. available to OpenTofu/Terraform and provisioner scripts). It is scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within a context of run's workspace. +1. 
Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. +2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. +3. Agent Task Token — a token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. +4. Scalr Run Token — a token exposed to the run environment as the `SCALR_TOKEN` environment variable (available to OpenTofu/Terraform and provisioner scripts). Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. -The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes task assignments and control commands (e.g. cancel) through this relay. All connections are outbound — the platform never initiates inbound connections to the agent. +The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. 
All connections are outbound — the platform never initiates inbound connections to the agent. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 714570fe..8fc8d755 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -405,14 +405,14 @@ This section describes the security model of the chart, covering how the agent a The agent authenticates with the Scalr platform using a token hierarchy with progressively narrower scopes — each token is a JWT and carries only the permissions required for its specific role: -1. Agent Pool Token — a long-lived JWT configured via the `agent.token` Helm value (exposed as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. -2. Agent Session Token — a short-lived token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. It is held in memory only (never written to disk) and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. -3. Agent Task Token — a scoped token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. Exists only for the lifetime of the task. -4. Scalr Run Token — a token exposed to the run environment as `SCALR_TOKEN` shell variable inside an agent runner container (i.e. available to OpenTofu/Terraform and provisioner scripts). It is scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within a context of run's workspace. +1. 
Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. +2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. +3. Agent Task Token — a token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. +4. Scalr Run Token — a token exposed to the run environment as the `SCALR_TOKEN` environment variable (available to OpenTofu/Terraform and provisioner scripts). Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. -The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes task assignments and control commands (e.g. cancel) through this relay. All connections are outbound — the platform never initiates inbound connections to the agent. +The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. 
All connections are outbound — the platform never initiates inbound connections to the agent.

All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values.

From e5979d61ed6343aee150adc9be17d5234914a5e6 Mon Sep 17 00:00:00 2001
From: Serhii Babak
Date: Thu, 12 Mar 2026 23:44:45 +0200
Subject: [PATCH 12/24] SCALRCORE-37660 update docs

---
 charts/agent-job/README.md        | 2 +-
 charts/agent-job/README.md.gotmpl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md
index 704b5746..4dd3e4fe 100644
--- a/charts/agent-job/README.md
+++ b/charts/agent-job/README.md
@@ -415,7 +415,7 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro
 
 Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators.
 
-The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the platform never initiates inbound connections to the agent.
+The agent establishes an outbound connection to the Scalr relay service (`relay.<account>.scalr.io`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer <token>` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports.
All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values.

diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl
index 8fc8d755..3cb627e7 100644
--- a/charts/agent-job/README.md.gotmpl
+++ b/charts/agent-job/README.md.gotmpl
@@ -412,7 +412,7 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro
 
 Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators.
 
-The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the platform never initiates inbound connections to the agent.
+The agent establishes an outbound connection to the Scalr relay service (`relay.<account>.scalr.io`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer <token>` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports.

All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values.
From ed2efff94ea7ca337e64311f583793f186cf81de Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 10:04:34 +0200 Subject: [PATCH 13/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 4 +++- charts/agent-job/README.md.gotmpl | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 4dd3e4fe..1cc31e14 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -419,7 +419,9 @@ The agent establishes an outbound connection to the Scalr relay service (`relay. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. -If a token is revoked or returns a `401`, the agent shuts down immediately. +If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. + +The Agent Task Token and Scalr Run Token are tied to the lifecycle of the run task and are revoked automatically when the task ends. ### Multi-tenant Isolation diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 3cb627e7..5e98076d 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -416,7 +416,9 @@ The agent establishes an outbound connection to the Scalr relay service (`relay. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. -If a token is revoked or returns a `401`, the agent shuts down immediately. +If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. 
+ +The Agent Task Token and Scalr Run Token are tied to the lifecycle of the run task and are revoked automatically when the task ends. ### Multi-tenant Isolation From 74822be7ea36ae5e2ee39f0106a87b43f6de49de Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 10:13:59 +0200 Subject: [PATCH 14/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 4 ++-- charts/agent-job/README.md.gotmpl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 1cc31e14..67158a3e 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -410,8 +410,8 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 1. Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. 2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. -3. Agent Task Token — a token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. -4. Scalr Run Token — a token exposed to the run environment as the `SCALR_TOKEN` environment variable (available to OpenTofu/Terraform and provisioner scripts). Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +3. 
Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. +4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 5e98076d..c436cb71 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -407,8 +407,8 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 1. Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. 2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. -3. Agent Task Token — a token included in each task payload by the Scalr platform, valid only for that specific task execution. Used by the agent worker for task-specific API calls, downloading the configuration version, and streaming logs. -4. 
Scalr Run Token — a token exposed to the run environment as the `SCALR_TOKEN` environment variable (available to OpenTofu/Terraform and provisioner scripts). Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. +4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. 
From 502df0ee84282c8eb3cb0f9370d63aede717fdd7 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 10:16:44 +0200 Subject: [PATCH 15/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 2 +- charts/agent-job/README.md.gotmpl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 67158a3e..cc36fdd8 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -415,7 +415,7 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. -The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. +In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. 
diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index c436cb71..32828cad 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -412,7 +412,7 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. -The agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. +In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. 
From cd42da7f33ba520c9c09b2803cb282c7620df0ad Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 10:18:41 +0200 Subject: [PATCH 16/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 10 ++++++---- charts/agent-job/README.md.gotmpl | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index cc36fdd8..78837ebb 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -413,16 +413,18 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. 4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. -Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. +The agent authenticates via `Authorization: Bearer ` headers. + +In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. +The Scalr platform pushes messages about available tasks and cancellation signals through this relay. 
-In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. +Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. +All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. -The Agent Task Token and Scalr Run Token are tied to the lifecycle of the run task and are revoked automatically when the task ends. - ### Multi-tenant Isolation This chart provides strong isolation for multi-tenant environments by deploying each run in a separate container with restricted filesystem access. diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 32828cad..169940c8 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -410,16 +410,18 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. 
Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. 4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. -Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. +The agent authenticates via `Authorization: Bearer ` headers. + +In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. +The Scalr platform pushes messages about available tasks and cancellation signals through this relay. -In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The agent authenticates via `Authorization: Bearer ` headers. The platform pushes messages about available tasks and cancellation signals through this relay. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. +Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. +All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. 
All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. -The Agent Task Token and Scalr Run Token are tied to the lifecycle of the run task and are revoked automatically when the task ends. - ### Multi-tenant Isolation This chart provides strong isolation for multi-tenant environments by deploying each run in a separate container with restricted filesystem access. From 42d62c29987c125cc27b9dd2d0b8d2cb17bf67c7 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 10:19:18 +0200 Subject: [PATCH 17/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 4 ++-- charts/agent-job/README.md.gotmpl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 78837ebb..3cd47841 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -415,14 +415,14 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro The agent authenticates via `Authorization: Bearer ` headers. +All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. + In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The Scalr platform pushes messages about available tasks and cancellation signals through this relay. Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. 
All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. -All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. - If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. ### Multi-tenant Isolation diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 169940c8..fbaff70f 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -412,14 +412,14 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro The agent authenticates via `Authorization: Bearer ` headers. +All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. + In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The Scalr platform pushes messages about available tasks and cancellation signals through this relay. Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. -All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. - If an Agent Pool Token is revoked (e.g. 
from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. ### Multi-tenant Isolation From 36c4c3f9549817cc411cfff86b8151286ac08c79 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 10:20:03 +0200 Subject: [PATCH 18/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 2 +- charts/agent-job/README.md.gotmpl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 3cd47841..b1ccc4ea 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -423,7 +423,7 @@ The Scalr platform pushes messages about available tasks and cancellation signal Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. -If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. +If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down. ### Multi-tenant Isolation diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index fbaff70f..8b1b29ee 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -420,7 +420,7 @@ The Scalr platform pushes messages about available tasks and cancellation signal Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. 
All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. -If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down immediately. +If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down. ### Multi-tenant Isolation From 7d0805b4daf388c5f5ceb42bc8cae4d9d554a637 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 11:22:19 +0200 Subject: [PATCH 19/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 2 +- charts/agent-job/README.md.gotmpl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index b1ccc4ea..c5d1fc64 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -411,7 +411,7 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 1. Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. 2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. 3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. -4. 
Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read` and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. The agent authenticates via `Authorization: Bearer ` headers. diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 8b1b29ee..43090896 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -408,7 +408,7 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 1. Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. 2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. 3. 
Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. -4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `module-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read` and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. The agent authenticates via `Authorization: Bearer ` headers. From 43b67d666519ab13b3628b50c000afdc8cd9c805 Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 11:32:02 +0200 Subject: [PATCH 20/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 12 ++++-------- charts/agent-job/README.md.gotmpl | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index c5d1fc64..45e01e79 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -411,17 +411,13 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 1. 
Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. 2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. 3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. -4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read` and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. -The agent authenticates via `Authorization: Bearer ` headers. 
+All API calls authenticate via `Authorization: Bearer ` headers. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. -All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. +Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. -In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. -The Scalr platform pushes messages about available tasks and cancellation signals through this relay. - -Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. -All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. +In addition to regular HTTP API calls, the agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The Scalr platform pushes messages about available tasks and cancellation signals through this relay. If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down. 
diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 43090896..6f51013f 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -408,17 +408,13 @@ The agent authenticates with the Scalr platform using a token hierarchy with pro 1. Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. 2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. 3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. -4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read` and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. 
Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. -The agent authenticates via `Authorization: Bearer ` headers. +All API calls authenticate via `Authorization: Bearer ` headers. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. -All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. +Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. -In addition to regular HTTP API calls, the agent establishes an additional outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. -The Scalr platform pushes messages about available tasks and cancellation signals through this relay. - -Communication with the Scalr platform uses HTTPS exclusively, making all traffic transparent for proxying and monitoring by agent operators. -All connections are outbound — the Scalr platform never initiates inbound connections to the agent, and the agent never exposes any TCP ports. +In addition to regular HTTP API calls, the agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The Scalr platform pushes messages about available tasks and cancellation signals through this relay. If an Agent Pool Token is revoked (e.g. 
from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down. From d93c068862c4791cd5f6beeb0836808bf8e1ef5d Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 15:46:19 +0200 Subject: [PATCH 21/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 7 +++---- charts/agent-job/README.md.gotmpl | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 45e01e79..ffde4d23 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -409,9 +409,8 @@ This section describes the security model of the chart, covering how the agent a The agent authenticates with the Scalr platform using a token hierarchy with progressively narrower scopes — each token is a JWT and carries only the permissions required for its specific role: 1. Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. -2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. -3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. -4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. 
Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +2. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. +3. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. All API calls authenticate via `Authorization: Bearer ` headers. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. @@ -419,7 +418,7 @@ Communication with the Scalr platform uses HTTPS exclusively, making all traffic In addition to regular HTTP API calls, the agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The Scalr platform pushes messages about available tasks and cancellation signals through this relay. -If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. 
Subsequent API calls return `401`, which causes the agents to shut down. +If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), subsequent API calls return `401`, which causes the agents using it to shut down. ### Multi-tenant Isolation diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 6f51013f..2244fb51 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -406,9 +406,8 @@ This section describes the security model of the chart, covering how the agent a The agent authenticates with the Scalr platform using a token hierarchy with progressively narrower scopes — each token is a JWT and carries only the permissions required for its specific role: 1. Agent Pool Token — a long-lived token configured via the `agent.token` Helm value (passed to agent services as `SCALR_AGENT_TOKEN`). It identifies the agent to the Scalr platform and is used only during initial registration and startup. -2. Agent Session Token — a token issued by the Scalr API in exchange for the Agent Pool Token after successful registration. Held in memory only and used for task acquisition and status updates during the lifetime of the agent controller and agent workers. -3. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. -4. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. 
Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. +2. Agent Task Token — a token generated by the Scalr platform during task acquisition for the agent worker, valid only for that specific task execution. Used for task-specific API calls, downloading the configuration version, and streaming logs. Scoped to the run's workspace. Exists only for the lifetime of the task. +3. Scalr Run Token — a token generated by the Scalr platform during task acquisition for the runner container, valid only for that specific task execution. Passed to the run environment as the `SCALR_TOKEN` environment variable for the OpenTofu/Terraform remote state backend. Scoped to the minimum permissions required for a run: `workspaces:read`, `workspaces:lock`, `module-versions:read`, `state-versions:read`, and `state-versions:create` within the context of the run's workspace. Exists only for the lifetime of the task. All API calls authenticate via `Authorization: Bearer ` headers. All tokens are passed to containers via Kubernetes Secrets and mounted as environment variables — they are never embedded in plaintext in Pod specs, ConfigMaps, or chart values. @@ -416,7 +415,7 @@ Communication with the Scalr platform uses HTTPS exclusively, making all traffic In addition to regular HTTP API calls, the agent establishes an outbound connection to the Scalr relay service (`relay.`) — an HTTP long-polling channel used for Scalr-to-agent messaging. The Scalr platform pushes messages about available tasks and cancellation signals through this relay. -If an Agent Pool Token is revoked (e.g. from the Scalr Agent Pool tokens page), all associated Agent Session Tokens are revoked. Subsequent API calls return `401`, which causes the agents to shut down. +If an Agent Pool Token is revoked (e.g. 
from the Scalr Agent Pool tokens page), subsequent API calls return `401`, which causes the agents using it to shut down. ### Multi-tenant Isolation From 3d3e244d19ac904c6b9ce68176e09b1fefbac285 Mon Sep 17 00:00:00 2001 From: aleatoricmbnt Date: Fri, 13 Mar 2026 16:58:32 +0200 Subject: [PATCH 22/24] SCALRCORE-37660 Added metadata service to the noProxy; added info on proxy network policy --- charts/agent-job/README.md.gotmpl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 2244fb51..67594097 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -277,11 +277,17 @@ global: enabled: true httpProxy: "http://proxy.example.com:8080" httpsProxy: "http://proxy.example.com:8080" - noProxy: "localhost,127.0.0.1,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" + noProxy: "localhost,127.0.0.1,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16,169.254.169.254" ``` The `noProxy` setting should include Kubernetes internal domains to avoid routing cluster traffic through the proxy. +**NetworkPolicy and proxy port** + +When `task.allowMetadataService` is false (the default), the chart creates a NetworkPolicy for task pods that allows DNS (port 53) and egress to the internet while blocking the VM metadata service. That policy does not allow egress to arbitrary ports (such as a proxy). If you use an HTTP proxy (e.g. on port 3128 or 8080), task pods must be allowed to reach it. + +Create a **separate** NetworkPolicy that allows egress from task pods to the proxy (by port and, if you want, by pod or namespace selector). Do not edit the policy created by the chart. Chart upgrades or redeploys can overwrite that policy; keeping proxy egress in your own policy keeps it under your control and avoids it being reverted. 
+ ## Custom Certificate Authorities If your environment uses custom or self-signed certificates, you can configure the CA bundle used by the agent for TLS validation. This configuration sets the **primary CA bundle** for all agent HTTPS connections (to Scalr API, VCS providers, provider registries, etc.). From 5f80a68371b4425d0ac63a5da03be460d99a22ad Mon Sep 17 00:00:00 2001 From: Serhii Babak Date: Fri, 13 Mar 2026 17:01:38 +0200 Subject: [PATCH 23/24] SCALRCORE-37660 update docs --- charts/agent-job/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index ffde4d23..3932aa0a 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -280,11 +280,17 @@ global: enabled: true httpProxy: "http://proxy.example.com:8080" httpsProxy: "http://proxy.example.com:8080" - noProxy: "localhost,127.0.0.1,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" + noProxy: "localhost,127.0.0.1,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16,169.254.169.254" ``` The `noProxy` setting should include Kubernetes internal domains to avoid routing cluster traffic through the proxy. +**NetworkPolicy and proxy port** + +When `task.allowMetadataService` is false (the default), the chart creates a NetworkPolicy for task pods that allows DNS (port 53) and egress to the internet while blocking the VM metadata service. That policy does not allow egress to arbitrary ports (such as a proxy). If you use an HTTP proxy (e.g. on port 3128 or 8080), task pods must be allowed to reach it. + +Create a **separate** NetworkPolicy that allows egress from task pods to the proxy (by port and, if you want, by pod or namespace selector). Do not edit the policy created by the chart. Chart upgrades or redeploys can overwrite that policy; keeping proxy egress in your own policy keeps it under your control and avoids it being reverted. 
+ ## Custom Certificate Authorities If your environment uses custom or self-signed certificates, you can configure the CA bundle used by the agent for TLS validation. This configuration sets the **primary CA bundle** for all agent HTTPS connections (to Scalr API, VCS providers, provider registries, etc.). From ef146e7a58a3dae47a2b926cb4760b0b28cc6845 Mon Sep 17 00:00:00 2001 From: aleatoricmbnt Date: Tue, 17 Mar 2026 16:28:34 +0200 Subject: [PATCH 24/24] SCALRCORE-37660 Update docs --- charts/agent-docker/README.md | 2 +- charts/agent-job/README.md | 340 +++++++++++++----------------- charts/agent-job/README.md.gotmpl | 14 +- charts/agent-k8s/README.md | 2 +- charts/agent-local/README.md | 143 +++++-------- 5 files changed, 207 insertions(+), 294 deletions(-) diff --git a/charts/agent-docker/README.md b/charts/agent-docker/README.md index 0f30b6d5..3ea2fb94 100644 --- a/charts/agent-docker/README.md +++ b/charts/agent-docker/README.md @@ -79,4 +79,4 @@ the same Kubernetes cluster to increase overall capacity. | tolerations | list | `[]` | Tolerations for the Scalr Agent pods, allowing them to run on tainted nodes | ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/charts/agent-job/README.md b/charts/agent-job/README.md index 3932aa0a..3ef1fb4a 100644 --- a/charts/agent-job/README.md +++ b/charts/agent-job/README.md @@ -461,11 +461,14 @@ This feature relies on egress NetworkPolicy enforcement, which requires a compat Ensure your cluster uses a CNI plugin that supports egress NetworkPolicies. 
Tested configurations: -| Cluster | CNI | IMDS Blocked | -|---------|-----|:------------:| -| AWS EKS | Calico | ✅ | -| GKE | Dataplane V1 (Calico) | ❌ | -| GKE | [Dataplane V2](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/dataplane-v2) (Cilium/eBPF) | ✅ | +| Cluster | CNI / network setup | IMDS blocked | +|-----------|--------------------------------------------------------------------------------------|:------------:| +| AWS EKS | Amazon VPC CNI (data plane) + Calico for network policy only (tigera-operator with `cni.type: AmazonVPC`) | ✅ | +| GKE | Dataplane V1 (Calico) | ❌ | +| GKE | [Dataplane V2](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/dataplane-v2) (Cilium/eBPF) | ✅ | +| Azure AKS | Azure CNI (network plugin) + Cilium (network data-plane) | ✅ | + +**Note (EKS):** Pod networking is provided by Amazon VPC CNI. Calico is used only as the network policy engine (no Calico data plane); the VPC CNI is patched so Calico can enforce policy (e.g. `ANNOTATE_POD_IP=true` on the aws-node DaemonSet). ## Network Requirements @@ -640,196 +643,149 @@ For errors, see the detailed steps at https://docs.scalr.io/docs/troubleshooting ## Values -### Agent - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| agent.affinity | object | `{}` | Node affinity for the controller pod. | -| agent.annotations | object | `{}` | Additional annotations for the Deployment (workload object). | -| agent.cacheDir | string | `"/var/lib/scalr-agent/cache"` | Cache directory where the agent stores provider binaries, plugin cache, and metadata. This directory must be readable, writable, and executable. | -| agent.controller | object | `{"extraEnv":[],"extraEnvFrom":[],"securityContext":{}}` | Controller-specific configuration. | -| agent.controller.extraEnv | list | `[]` | Additional environment variables for the controller container only. 
| -| agent.controller.extraEnvFrom | list | `[]` | Additional environment variable sources for the controller container. | -| agent.controller.securityContext | object | `{}` | Default security context for agent controller container. | -| agent.dataDir | string | `"/var/lib/scalr-agent/data"` | Data directory where the agent stores workspace data (configuration versions, modules, and providers). This directory must be readable, writable, and executable. | -| agent.debug | string | `"0"` | Enable debug logging. | -| agent.extraEnv | object | `{}` | Additional environment variables for agent controller and worker containers. | -| agent.image | object | `{"pullPolicy":"IfNotPresent","repository":"scalr/agent","tag":""}` | Agent image configuration (used by both controller and worker containers). | -| agent.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. | -| agent.image.repository | string | `"scalr/agent"` | Docker repository for the Scalr Agent image. | -| agent.image.tag | string | `""` | Image tag. Defaults to the chart appVersion if not specified. | -| agent.labels | object | `{}` | Additional labels for the Deployment (workload object). | -| agent.logFormat | string | `"json"` | The log formatter. Options: plain, dev or json. Defaults to json. | -| agent.moduleCache.concurrency | int | `10` | Maximum number of threads used for module cache operations (initialization and caching). This value is global for the Scalr service and applies across all concurrent runs. Increasing it will increase resource consumption and may improve module cache speed, but the effect depends on individual setups. | -| agent.moduleCache.enabled | bool | `false` | Enable module caching. Disabled by default since the default configuration uses an ephemeral volume for the cache directory. | -| agent.moduleCache.sizeLimit | string | `"40Gi"` | Module cache soft limit. Must be tuned according to cache directory size. 
| -| agent.nodeSelector | object | `{}` | Node selector for assigning the controller pod to specific nodes. Example: `--set agent.nodeSelector."node-type"="agent-controller"` | -| agent.podAnnotations | object | `{}` | Controller-specific pod annotations (merged with global.podAnnotations, overrides duplicate keys). | -| agent.podDisruptionBudget | object | `{"enabled":true,"maxUnavailable":null,"minAvailable":1}` | PodDisruptionBudget configuration for controller high availability. Only applied when replicaCount > 1. Ensures minimum availability during voluntary disruptions. | -| agent.podDisruptionBudget.enabled | bool | `true` | Enable PodDisruptionBudget for the controller. | -| agent.podDisruptionBudget.maxUnavailable | string | `nil` | Maximum number of controller pods that can be unavailable. Either minAvailable or maxUnavailable must be set, not both. | -| agent.podDisruptionBudget.minAvailable | int | `1` | Minimum number of controller pods that must be available. Either minAvailable or maxUnavailable must be set, not both. | -| agent.podLabels | object | `{}` | Controller-specific pod labels (merged with global.podLabels, overrides duplicate keys). | -| agent.podSecurityContext | object | `{}` | Controller-specific pod security context (merged with global.podSecurityContext, overrides duplicate keys). | -| agent.providerCache.concurrency | int | `10` | Maximum number of threads used for provider installations. This value is global for the Scalr service and applies across all concurrent runs. Increasing it will increase resource consumption and may improve provider installation speed, but the effect depends on individual setups. | -| agent.providerCache.enabled | bool | `false` | Enable provider caching. Disabled by default since the default configuration uses an ephemeral volume for the cache directory. | -| agent.providerCache.sizeLimit | string | `"40Gi"` | Provider cache soft limit. Must be tuned according to cache directory size. 
| -| agent.replicaCount | int | `1` | Number of agent controller replicas. | -| agent.resources | object | `{"requests":{"cpu":"100m","memory":"256Mi"}}` | Resource requests and limits for the agent controller container. | -| agent.sentryDsn | string | `""` | Sentry DSN for error tracking. Leave empty to disable. | -| agent.terminationGracePeriodSeconds | int | `180` | Grace period in seconds before forcibly terminating the controller container. | -| agent.token | string | `""` | The agent pool token for authentication. | -| agent.tokenExistingSecret | object | `{"key":"token","name":""}` | Pre-existing Kubernetes secret for the Scalr Agent token. | -| agent.tokenExistingSecret.key | string | `"token"` | Key within the secret that holds the token value. | -| agent.tokenExistingSecret.name | string | `""` | Name of the secret containing the token. | -| agent.tolerations | list | `[]` | Node tolerations for the controller pod. Expects input structure as per specification . Example: `--set agent.tolerations[0].key=dedicated,agent.tolerations[0].operator=Equal,agent.tolerations[0].value=agent-controller,agent.tolerations[0].effect=NoSchedule` | -| agent.topologySpreadConstraints | object | `{}` | Topology spread constraints for the controller pod. | -| agent.url | string | `""` | The Scalr URL to connect the agent to. | - -### Global - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| global.annotations | object | `{}` | Global annotations applied to all chart resources (metadata.annotations). | -| global.imageNamespace | string | "" | Global image namespace/organization override for all images. Replaces the namespace in repositories (e.g., "myorg" changes "scalr/runner" to "myorg/runner"). Combined: registry="gcr.io/project" + namespace="myorg" + repo="scalr/runner" → "gcr.io/project/myorg/runner:tag" Leave empty to preserve original namespace. 
| -| global.imagePullSecrets | list | `[]` | Global image pull secrets for private registries. | -| global.imageRegistry | string | "" | Global Docker registry override for all images. Prepended to image repositories. Example: "us-central1-docker.pkg.dev/myorg/images" Leave empty to use default Docker Hub. | -| global.labels | object | `{}` | Global labels applied to all chart resources (metadata.labels). | -| global.podAnnotations | object | `{}` | Global pod annotations applied to all pods. | -| global.podLabels | object | `{}` | Global pod labels applied to all pods. | -| global.podSecurityContext | object | `{"fsGroup":1000,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000,"seLinuxOptions":{},"seccompProfile":{"type":"RuntimeDefault"},"supplementalGroups":[],"sysctls":[]}` | Security context applied to all pods. | -| global.podSecurityContext.fsGroup | int | `1000` | File system group for volume ownership. | -| global.podSecurityContext.fsGroupChangePolicy | string | `"OnRootMismatch"` | File system group change policy. | -| global.podSecurityContext.runAsGroup | int | `1000` | Group ID for all containers in the pod. | -| global.podSecurityContext.runAsNonRoot | bool | `true` | Run pod as non-root for security. | -| global.podSecurityContext.runAsUser | int | `1000` | User ID for all containers in the pod. | -| global.podSecurityContext.seLinuxOptions | object | `{}` | SELinux options for the pod. | -| global.podSecurityContext.seccompProfile | object | `{"type":"RuntimeDefault"}` | Seccomp profile for enhanced security. | -| global.podSecurityContext.supplementalGroups | list | `[]` | Supplemental groups for the containers. | -| global.podSecurityContext.sysctls | list | `[]` | Sysctls for the pod. 
| - -### Global.Proxy - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| global.proxy | object | `{"enabled":false,"httpProxy":"","httpsProxy":"","noProxy":""}` | HTTP proxy configuration for external connectivity. | -| global.proxy.enabled | bool | `false` | Enable injection of HTTP(S) proxy settings into all agent pods. | -| global.proxy.httpProxy | string | `""` | HTTP proxy URL applied to all agent containers (HTTP_PROXY). Example: "http://proxy.example.com:8080" | -| global.proxy.httpsProxy | string | `""` | HTTPS proxy URL applied to all agent containers (HTTPS_PROXY). Example: "http://proxy.example.com:8080" | -| global.proxy.noProxy | string | `""` | Comma-separated domains/IPs that bypass the proxy (NO_PROXY). Recommended to include Kubernetes internal domains to avoid routing cluster traffic through the proxy. Example: "localhost,127.0.0.1,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" | - -### Global.TLS - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| global.tls | object | `{"caBundle":"","caBundleSecret":{"key":"ca-bundle.crt","name":""}}` | TLS/SSL configuration for custom certificate authorities. | -| global.tls.caBundle | string | `""` | Inline CA bundle content as an alternative to caBundleSecret. Provide the complete CA certificate chain in PEM format. If both caBundleSecret.name and caBundle are set, caBundleSecret takes precedence. Example: caBundle: | -----BEGIN CERTIFICATE----- MIIDXTCCAkWgAwIBAgIJAKZ... -----END CERTIFICATE----- -----BEGIN CERTIFICATE----- MIIEFzCCAv+gAwIBAgIUDiCT... -----END CERTIFICATE----- | -| global.tls.caBundleSecret | object | `{"key":"ca-bundle.crt","name":""}` | Reference to an existing Kubernetes secret containing a CA bundle. This CA bundle is mounted to all agent pods and used for outbound TLS validation (e.g., Scalr API, VCS, registries). The secret must exist in the same namespace as the chart installation. 
If both caBundleSecret.name and caBundle are set, caBundleSecret takes precedence. | -| global.tls.caBundleSecret.key | string | `"ca-bundle.crt"` | Key within the secret that contains the CA bundle file. | -| global.tls.caBundleSecret.name | string | `""` | Name of the Kubernetes secret containing the CA bundle. Leave empty to use the inline caBundle or system certificates. | - -### OpenTelemetry - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| otel.enabled | bool | `false` | Enable OpenTelemetry integration. | -| otel.endpoint | string | `"http://otel-collector:4317"` | OpenTelemetry collector endpoint. | -| otel.metricsEnabled | bool | `true` | Collect and export metrics. | -| otel.tracesEnabled | bool | `false` | Collect and export traces. | - -### Persistence - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| persistence.cache | object | `{"emptyDir":{"sizeLimit":"1Gi"},"enabled":false,"persistentVolumeClaim":{"accessMode":"ReadWriteMany","claimName":"","storage":"90Gi","storageClassName":"","subPath":""}}` | Cache directory storage configuration. Stores OpenTofu/Terraform providers, modules and binaries. Mounted to both worker (for agent cache) and runner (for binary/plugin cache) containers. | -| persistence.cache.emptyDir | object | `{"sizeLimit":"1Gi"}` | EmptyDir volume configuration (used when enabled is false). | -| persistence.cache.emptyDir.sizeLimit | string | `"1Gi"` | Size limit for the emptyDir volume. | -| persistence.cache.enabled | bool | `false` | Enable persistent storage for cache directory. Highly recommended: Avoids re-downloading providers and binaries (saves 1-5 minutes per run). When false, providers and binaries are downloaded fresh for each task. When true, cache is shared across all task pods for significant performance improvement (may vary depending on RWM volume performace). 
| -| persistence.cache.persistentVolumeClaim | object | `{"accessMode":"ReadWriteMany","claimName":"","storage":"90Gi","storageClassName":"","subPath":""}` | PersistentVolumeClaim configuration (used when enabled is true). | -| persistence.cache.persistentVolumeClaim.accessMode | string | `"ReadWriteMany"` | Access mode for the PVC. Use ReadWriteMany to share cache across multiple task pods. Note: ReadWriteMany requires compatible storage class (e.g., NFS, EFS, Filestore). | -| persistence.cache.persistentVolumeClaim.claimName | string | `""` | Name of an existing PVC. If empty, a new PVC named `-cache` is created. | -| persistence.cache.persistentVolumeClaim.storage | string | `"90Gi"` | Storage size for the PVC. | -| persistence.cache.persistentVolumeClaim.storageClassName | string | `""` | Storage class for the PVC. Leave empty to use the cluster's default storage class. | -| persistence.cache.persistentVolumeClaim.subPath | string | `""` | Optional subPath for mounting a specific subdirectory of the volume. Useful when sharing a single PVC across multiple installations. | -| persistence.data | object | `{"emptyDir":{"sizeLimit":"4Gi"},"enabled":false,"persistentVolumeClaim":{"accessMode":"ReadWriteOnce","claimName":"","storage":"4Gi","storageClassName":"","subPath":""}}` | Data directory storage configuration. Stores workspace data including configuration versions, modules, and run metadata. This directory is mounted to the worker sidecar container. | -| persistence.data.emptyDir | object | `{"sizeLimit":"4Gi"}` | EmptyDir volume configuration (used when enabled is false). | -| persistence.data.emptyDir.sizeLimit | string | `"4Gi"` | Size limit for the emptyDir volume. | -| persistence.data.enabled | bool | `false` | Enable persistent storage for data directory. When false, uses emptyDir (ephemeral, recommended for most use cases as each run gets fresh workspace). 
When true, uses PVC (persistent across pod restarts, useful for debugging or sharing data between runs). | -| persistence.data.persistentVolumeClaim | object | `{"accessMode":"ReadWriteOnce","claimName":"","storage":"4Gi","storageClassName":"","subPath":""}` | PersistentVolumeClaim configuration (used when enabled is true). | -| persistence.data.persistentVolumeClaim.accessMode | string | `"ReadWriteOnce"` | Access mode for the PVC. | -| persistence.data.persistentVolumeClaim.claimName | string | `""` | Name of an existing PVC. If empty, a new PVC named `-data` is created. | -| persistence.data.persistentVolumeClaim.storage | string | `"4Gi"` | Storage size for the PVC. | -| persistence.data.persistentVolumeClaim.storageClassName | string | `""` | Storage class for the PVC. Leave empty to use the cluster's default storage class. | -| persistence.data.persistentVolumeClaim.subPath | string | `""` | Optional subPath for mounting a specific subdirectory of the volume. | - -### RBAC - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| rbac.clusterRules | list | `[{"apiGroups":["scalr.io"],"resources":["agenttasktemplates"],"verbs":["get","list","watch"]}]` | Cluster-wide RBAC rules (applied via ClusterRole bound in the release namespace). | -| rbac.create | bool | `true` | Create the namespaced Role/RoleBinding and cluster-scope RoleBinding. 
| -| rbac.rules | list | `[{"apiGroups":[""],"resources":["pods"],"verbs":["get","list","watch","create","delete","deletecollection","patch","update"]},{"apiGroups":[""],"resources":["pods/log"],"verbs":["get"]},{"apiGroups":[""],"resources":["pods/exec"],"verbs":["get","create"]},{"apiGroups":[""],"resources":["pods/status"],"verbs":["get","patch","update"]},{"apiGroups":["apps"],"resources":["deployments"],"verbs":["get","list","watch"]},{"apiGroups":["batch"],"resources":["jobs"],"verbs":["get","list","watch","create","delete","deletecollection","patch","update"]},{"apiGroups":["batch"],"resources":["jobs/status"],"verbs":["get","patch","update"]},{"apiGroups":[""],"resources":["events"],"verbs":["list"]}]` | Namespaced RBAC rules granted to the controller ServiceAccount. | - -### Service account - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| serviceAccount.annotations | object | `{}` | Annotations for the service account. | -| serviceAccount.automountToken | bool | `true` | Whether to automount the service account token in pods. | -| serviceAccount.create | bool | `true` | Create a Kubernetes service account for the Scalr Agent. | -| serviceAccount.labels | object | `{}` | Additional labels for the service account. | -| serviceAccount.name | string | `""` | Name of the service account. Generated if not set and create is true. | -| serviceAccount.tokenTTL | int | `3600` | Token expiration period in seconds. | - -### Task - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| task.affinity | object | `{}` | Node affinity for task job pods. | -| task.allowMetadataService | bool | `false` | When set to `true`, disables the NetworkPolicy that blocks access to the VM metadata service (`169.254.169.254`) for agent task containers. When set to `false` (default), a NetworkPolicy is created to prevent workloads from accessing cloud credentials or instance metadata. 
| -| task.extraVolumes | list | `[]` | Additional volumes for task job pods. | -| task.job | object | `{"basename":"","ttlSecondsAfterFinished":60}` | Job configuration for task execution. | -| task.job.basename | string | `""` | Base name prefix for spawned Kubernetes Jobs (defaults to fullname, e.g., "scalr-agent"). Jobs are named as `-`. See README for details on task naming. | -| task.job.ttlSecondsAfterFinished | int | `60` | Time in seconds after job completion before it is automatically deleted. | -| task.jobAnnotations | object | `{}` | Additional annotations for the Job (workload object). | -| task.jobLabels | object | `{}` | Additional labels for the Job (workload object). | -| task.nodeSelector | object | `{}` | Node selector for assigning task job pods to specific nodes. Example: `--set task.nodeSelector."node-type"="agent-worker"` | -| task.podAnnotations | object | `{}` | Task-specific pod annotations (merged with global.podAnnotations, overrides duplicate keys). | -| task.podLabels | object | `{}` | Task-specific pod labels (merged with global.podLabels, overrides duplicate keys). | -| task.podSecurityContext | object | `{}` | Task-specific pod security context (merged with global.podSecurityContext, overrides duplicate keys). | -| task.runner | object | `{"extraEnv":{},"extraVolumeMounts":[],"image":{"pullPolicy":"IfNotPresent","repository":"scalr/runner","tag":"0.2.0"},"memorySoftLimitPercent":80,"memoryWarnPercent":90,"resources":{"limits":{"cpu":"4000m","memory":"2048Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"securityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"privileged":false,"readOnlyRootFilesystem":true,"runAsNonRoot":true,"seLinuxOptions":{}}}` | Runner container configuration (environment where Terraform/OpenTofu commands are executed). | -| task.runner.extraEnv | object | `{}` | Additional environment variables for the runner container. 
| -| task.runner.extraVolumeMounts | list | `[]` | Additional volume mounts for the runner container. | -| task.runner.image | object | `{"pullPolicy":"IfNotPresent","repository":"scalr/runner","tag":"0.2.0"}` | Runner container image settings. Default image: https://hub.docker.com/r/scalr/runner, repository: https://github.com/Scalr/runner Note: For Scalr-managed agents, this may be overridden by Scalr account image settings. | -| task.runner.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. | -| task.runner.image.repository | string | `"scalr/runner"` | Default repository for the runner image. | -| task.runner.image.tag | string | `"0.2.0"` | Default tag for the runner image. | -| task.runner.memorySoftLimitPercent | int | `80` | Memory soft limit as a percentage of the hard limit (task.runner.resources.limits.memory). When memory usage exceeds this value, the process will be gracefully terminated by the agent. Graceful termination ensures that OpenTofu/Terraform workloads push state before exiting, preventing state loss. Setting this value too high reduces the memory headroom available for state push and increases the risk of state loss. Have no effect when task.runner.resources.limits.memory is not set. For example, when task.runner.resources.limits.memory is set to 1000Mi and memorySoftLimitPercent is 80%, the workload will be gracefully terminated when memory usage reaches 800Mi. | -| task.runner.memoryWarnPercent | int | `90` | Memory warning threshold as a percentage of the soft limit (task.runner.memorySoftLimitPercent). A warning is logged to the run console when memory usage exceeds this value, indicating that the workload is at risk of being terminated due to high memory usage. The warning is reported after the run completes. Has no effect when task.runner.memorySoftLimitPercent or task.runner.resources.limits.memory are not set. 
| -| task.runner.resources | object | `{"limits":{"cpu":"4000m","memory":"2048Mi"},"requests":{"cpu":"500m","memory":"512Mi"}}` | Resource requests and limits for the runner container. Note: For scalr-managed agents, this may be overridden by Scalr platform billing resource tier presets. | -| task.runner.securityContext | object | `{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"privileged":false,"readOnlyRootFilesystem":true,"runAsNonRoot":true,"seLinuxOptions":{}}` | Security context for the runner container. The default declaration duplicates some critical options from podSecurityContext to keep them independent. | -| task.runner.securityContext.allowPrivilegeEscalation | bool | `false` | Allow privilege escalation. | -| task.runner.securityContext.capabilities | object | `{"drop":["ALL"]}` | Container capabilities restrictions for security. | -| task.runner.securityContext.privileged | bool | `false` | Run container in privileged mode. | -| task.runner.securityContext.readOnlyRootFilesystem | bool | `true` | Read-only root filesystem. | -| task.runner.securityContext.runAsNonRoot | bool | `true` | Run container as non-root user for security. | -| task.runner.securityContext.seLinuxOptions | object | `{}` | SELinux options for the container. | -| task.sidecars | list | `[]` | Additional sidecar containers for task job pods. | -| task.startupTimeoutSeconds | int | `180` | Maximum time in seconds for the agent worker container to become ready and begin Scalr run execution. If the pod does not start within this period, the controller fails the Scalr run and deletes the job. | -| task.terminationGracePeriodSeconds | int | `360` | Grace period in seconds before forcibly terminating task job containers. | -| task.tolerations | list | `[]` | Node tolerations for task job pods. Expects input structure as per specification . 
Example: `--set task.tolerations[0].key=dedicated,task.tolerations[0].operator=Equal,task.tolerations[0].value=agent-worker,task.tolerations[0].effect=NoSchedule` | -| task.worker | object | `{"extraEnv":{},"extraVolumeMounts":[],"resources":{"limits":{"memory":"1024Mi"},"requests":{"cpu":"250m","memory":"256Mi"}},"securityContext":{}}` | Worker container configuration (sidecar that supervises task execution). | -| task.worker.extraEnv | object | `{}` | Additional environment variables for the worker container (merged with agent.extraEnv). | -| task.worker.extraVolumeMounts | list | `[]` | Additional volume mounts for the worker container. | -| task.worker.resources | object | `{"limits":{"memory":"1024Mi"},"requests":{"cpu":"250m","memory":"256Mi"}}` | Resource requests and limits for the worker container. | -| task.worker.securityContext | object | `{}` | Security context for the worker container. | - -### Other Values - | Key | Type | Default | Description | |-----|------|---------|-------------| +| agent.affinity | object | `{}` | Node affinity for the controller pod. @section -- Agent | +| agent.annotations | object | `{}` | Additional annotations for the Deployment (workload object). @section -- Agent | +| agent.cacheDir | string | `"/var/lib/scalr-agent/cache"` | Cache directory where the agent stores provider binaries, plugin cache, and metadata. This directory must be readable, writable, and executable. @section -- Agent | +| agent.controller | object | `{"extraEnv":[],"extraEnvFrom":[],"securityContext":{}}` | Controller-specific configuration. @section -- Agent | +| agent.controller.extraEnv | list | `[]` | Additional environment variables for the controller container only. @section -- Agent | +| agent.controller.extraEnvFrom | list | `[]` | Additional environment variable sources for the controller container. @section -- Agent | +| agent.controller.securityContext | object | `{}` | Default security context for agent controller container. 
@section -- Agent | +| agent.dataDir | string | `"/var/lib/scalr-agent/data"` | Data directory where the agent stores workspace data (configuration versions, modules, and providers). This directory must be readable, writable, and executable. @section -- Agent | +| agent.debug | string | `"0"` | Enable debug logging. @section -- Agent | +| agent.extraEnv | object | `{}` | Additional environment variables for agent controller and worker containers. @section -- Agent | +| agent.image | object | `{"pullPolicy":"IfNotPresent","repository":"scalr/agent","tag":""}` | Agent image configuration (used by both controller and worker containers). @section -- Agent | +| agent.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. @section -- Agent | +| agent.image.repository | string | `"scalr/agent"` | Docker repository for the Scalr Agent image. @section -- Agent | +| agent.image.tag | string | `""` | Image tag. Defaults to the chart appVersion if not specified. @section -- Agent | +| agent.labels | object | `{}` | Additional labels for the Deployment (workload object). @section -- Agent | +| agent.logFormat | string | `"json"` | The log formatter. Options: plain, dev or json. Defaults to json. @section -- Agent | +| agent.moduleCache.concurrency | int | `10` | Maximum number of threads used for module cache operations (initialization and caching). This value is global for the Scalr service and applies across all concurrent runs. Increasing it will increase resource consumption and may improve module cache speed, but the effect depends on individual setups. @section -- Agent | +| agent.moduleCache.enabled | bool | `false` | Enable module caching. Disabled by default since the default configuration uses an ephemeral volume for the cache directory. @section -- Agent | +| agent.moduleCache.sizeLimit | string | `"40Gi"` | Module cache soft limit. Must be tuned according to cache directory size. 
@section -- Agent | +| agent.nodeSelector | object | `{}` | Node selector for assigning the controller pod to specific nodes. Example: `--set agent.nodeSelector."node-type"="agent-controller"` @section -- Agent | +| agent.podAnnotations | object | `{}` | Controller-specific pod annotations (merged with global.podAnnotations, overrides duplicate keys). @section -- Agent | +| agent.podDisruptionBudget | object | `{"enabled":true,"maxUnavailable":null,"minAvailable":1}` | PodDisruptionBudget configuration for controller high availability. Only applied when replicaCount > 1. Ensures minimum availability during voluntary disruptions. @section -- Agent | +| agent.podDisruptionBudget.enabled | bool | `true` | Enable PodDisruptionBudget for the controller. @section -- Agent | +| agent.podDisruptionBudget.maxUnavailable | string | `nil` | Maximum number of controller pods that can be unavailable. Either minAvailable or maxUnavailable must be set, not both. @section -- Agent | +| agent.podDisruptionBudget.minAvailable | int | `1` | Minimum number of controller pods that must be available. Either minAvailable or maxUnavailable must be set, not both. @section -- Agent | +| agent.podLabels | object | `{}` | Controller-specific pod labels (merged with global.podLabels, overrides duplicate keys). @section -- Agent | +| agent.podSecurityContext | object | `{}` | Controller-specific pod security context (merged with global.podSecurityContext, overrides duplicate keys). @section -- Agent | +| agent.providerCache.concurrency | int | `10` | Maximum number of threads used for provider installations. This value is global for the Scalr service and applies across all concurrent runs. Increasing it will increase resource consumption and may improve provider installation speed, but the effect depends on individual setups. @section -- Agent | +| agent.providerCache.enabled | bool | `false` | Enable provider caching. 
Disabled by default since the default configuration uses an ephemeral volume for the cache directory. @section -- Agent | +| agent.providerCache.sizeLimit | string | `"40Gi"` | Provider cache soft limit. Must be tuned according to cache directory size. @section -- Agent | +| agent.replicaCount | int | `1` | Number of agent controller replicas. @section -- Agent | +| agent.resources | object | `{"requests":{"cpu":"100m","memory":"256Mi"}}` | Resource requests and limits for the agent controller container. @section -- Agent | +| agent.sentryDsn | string | `""` | Sentry DSN for error tracking. Leave empty to disable. @section -- Agent | +| agent.terminationGracePeriodSeconds | int | `180` | Grace period in seconds before forcibly terminating the controller container. @section -- Agent | +| agent.token | string | `""` | The agent pool token for authentication. @section -- Agent | +| agent.tokenExistingSecret | object | `{"key":"token","name":""}` | Pre-existing Kubernetes secret for the Scalr Agent token. @section -- Agent | +| agent.tokenExistingSecret.key | string | `"token"` | Key within the secret that holds the token value. @section -- Agent | +| agent.tokenExistingSecret.name | string | `""` | Name of the secret containing the token. @section -- Agent | +| agent.tolerations | list | `[]` | Node tolerations for the controller pod. Expects input structure as per specification . Example: `--set agent.tolerations[0].key=dedicated,agent.tolerations[0].operator=Equal,agent.tolerations[0].value=agent-controller,agent.tolerations[0].effect=NoSchedule` @section -- Agent | +| agent.topologySpreadConstraints | object | `{}` | Topology spread constraints for the controller pod. @section -- Agent | +| agent.url | string | `""` | The Scalr URL to connect the agent to. @section -- Agent | | fullnameOverride | string | `""` | Override the full name of resources (takes precedence over nameOverride). 
| +| global.annotations | object | `{}` | Global annotations applied to all chart resources (metadata.annotations). @section -- Global | +| global.imageNamespace | string | "" | Global image namespace/organization override for all images. Replaces the namespace in repositories (e.g., "myorg" changes "scalr/runner" to "myorg/runner"). Combined: registry="gcr.io/project" + namespace="myorg" + repo="scalr/runner" → "gcr.io/project/myorg/runner:tag" Leave empty to preserve original namespace. @section -- Global | +| global.imagePullSecrets | list | `[]` | Global image pull secrets for private registries. @section -- Global | +| global.imageRegistry | string | "" | Global Docker registry override for all images. Prepended to image repositories. Example: "us-central1-docker.pkg.dev/myorg/images" Leave empty to use default Docker Hub. @section -- Global | +| global.labels | object | `{}` | Global labels applied to all chart resources (metadata.labels). @section -- Global | +| global.podAnnotations | object | `{}` | Global pod annotations applied to all pods. @section -- Global | +| global.podLabels | object | `{}` | Global pod labels applied to all pods. @section -- Global | +| global.podSecurityContext | object | `{"fsGroup":1000,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000,"seLinuxOptions":{},"seccompProfile":{"type":"RuntimeDefault"},"supplementalGroups":[],"sysctls":[]}` | Security context applied to all pods. @section -- Global | +| global.podSecurityContext.fsGroup | int | `1000` | File system group for volume ownership. @section -- Global | +| global.podSecurityContext.fsGroupChangePolicy | string | `"OnRootMismatch"` | File system group change policy. @section -- Global | +| global.podSecurityContext.runAsGroup | int | `1000` | Group ID for all containers in the pod. @section -- Global | +| global.podSecurityContext.runAsNonRoot | bool | `true` | Run pod as non-root for security. 
@section -- Global | +| global.podSecurityContext.runAsUser | int | `1000` | User ID for all containers in the pod. @section -- Global | +| global.podSecurityContext.seLinuxOptions | object | `{}` | SELinux options for the pod. @section -- Global | +| global.podSecurityContext.seccompProfile | object | `{"type":"RuntimeDefault"}` | Seccomp profile for enhanced security. @section -- Global | +| global.podSecurityContext.supplementalGroups | list | `[]` | Supplemental groups for the containers. @section -- Global | +| global.podSecurityContext.sysctls | list | `[]` | Sysctls for the pod. @section -- Global | +| global.proxy | object | `{"enabled":false,"httpProxy":"","httpsProxy":"","noProxy":""}` | HTTP proxy configuration for external connectivity. @section -- Global.Proxy | +| global.proxy.enabled | bool | `false` | Enable injection of HTTP(S) proxy settings into all agent pods. @section -- Global.Proxy | +| global.proxy.httpProxy | string | `""` | HTTP proxy URL applied to all agent containers (HTTP_PROXY). Example: "http://proxy.example.com:8080" @section -- Global.Proxy | +| global.proxy.httpsProxy | string | `""` | HTTPS proxy URL applied to all agent containers (HTTPS_PROXY). Example: "http://proxy.example.com:8080" @section -- Global.Proxy | +| global.proxy.noProxy | string | `""` | Comma-separated domains/IPs that bypass the proxy (NO_PROXY). Recommended to include Kubernetes internal domains to avoid routing cluster traffic through the proxy. Example: "localhost,127.0.0.1,.svc,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" @section -- Global.Proxy | +| global.tls | object | `{"caBundle":"","caBundleSecret":{"key":"ca-bundle.crt","name":""}}` | TLS/SSL configuration for custom certificate authorities. @section -- Global.TLS | +| global.tls.caBundle | string | `""` | Inline CA bundle content as an alternative to caBundleSecret. Provide the complete CA certificate chain in PEM format. 
If both caBundleSecret.name and caBundle are set, caBundleSecret takes precedence. Example: caBundle: | -----BEGIN CERTIFICATE----- MIIDXTCCAkWgAwIBAgIJAKZ... -----END CERTIFICATE----- -----BEGIN CERTIFICATE----- MIIEFzCCAv+gAwIBAgIUDiCT... -----END CERTIFICATE----- @section -- Global.TLS | +| global.tls.caBundleSecret | object | `{"key":"ca-bundle.crt","name":""}` | Reference to an existing Kubernetes secret containing a CA bundle. This CA bundle is mounted to all agent pods and used for outbound TLS validation (e.g., Scalr API, VCS, registries). The secret must exist in the same namespace as the chart installation. If both caBundleSecret.name and caBundle are set, caBundleSecret takes precedence. @section -- Global.TLS | +| global.tls.caBundleSecret.key | string | `"ca-bundle.crt"` | Key within the secret that contains the CA bundle file. @section -- Global.TLS | +| global.tls.caBundleSecret.name | string | `""` | Name of the Kubernetes secret containing the CA bundle. Leave empty to use the inline caBundle or system certificates. @section -- Global.TLS | | nameOverride | string | `""` | Override the base name used in resource names (defaults to "scalr-agent"). | +| otel.enabled | bool | `false` | Enable OpenTelemetry integration. @section -- OpenTelemetry | +| otel.endpoint | string | `"http://otel-collector:4317"` | OpenTelemetry collector endpoint. @section -- OpenTelemetry | +| otel.metricsEnabled | bool | `true` | Collect and export metrics. @section -- OpenTelemetry | +| otel.tracesEnabled | bool | `false` | Collect and export traces. @section -- OpenTelemetry | +| persistence.cache | object | `{"emptyDir":{"sizeLimit":"1Gi"},"enabled":false,"persistentVolumeClaim":{"accessMode":"ReadWriteMany","claimName":"","storage":"90Gi","storageClassName":"","subPath":""}}` | Cache directory storage configuration. Stores OpenTofu/Terraform providers, modules and binaries. Mounted to both worker (for agent cache) and runner (for binary/plugin cache) containers. 
@section -- Persistence |
+| persistence.cache.emptyDir | object | `{"sizeLimit":"1Gi"}` | EmptyDir volume configuration (used when enabled is false). @section -- Persistence |
+| persistence.cache.emptyDir.sizeLimit | string | `"1Gi"` | Size limit for the emptyDir volume. @section -- Persistence |
+| persistence.cache.enabled | bool | `false` | Enable persistent storage for cache directory. Highly recommended: Avoids re-downloading providers and binaries (saves 1-5 minutes per run). When false, providers and binaries are downloaded fresh for each task. When true, cache is shared across all task pods for significant performance improvement (may vary depending on RWM volume performance). @section -- Persistence |
+| persistence.cache.persistentVolumeClaim | object | `{"accessMode":"ReadWriteMany","claimName":"","storage":"90Gi","storageClassName":"","subPath":""}` | PersistentVolumeClaim configuration (used when enabled is true). @section -- Persistence |
+| persistence.cache.persistentVolumeClaim.accessMode | string | `"ReadWriteMany"` | Access mode for the PVC. Use ReadWriteMany to share cache across multiple task pods. Note: ReadWriteMany requires compatible storage class (e.g., NFS, EFS, Filestore). @section -- Persistence |
+| persistence.cache.persistentVolumeClaim.claimName | string | `""` | Name of an existing PVC. If empty, a new PVC named `-cache` is created. @section -- Persistence |
+| persistence.cache.persistentVolumeClaim.storage | string | `"90Gi"` | Storage size for the PVC. @section -- Persistence |
+| persistence.cache.persistentVolumeClaim.storageClassName | string | `""` | Storage class for the PVC. Leave empty to use the cluster's default storage class. @section -- Persistence |
+| persistence.cache.persistentVolumeClaim.subPath | string | `""` | Optional subPath for mounting a specific subdirectory of the volume. Useful when sharing a single PVC across multiple installations. 
@section -- Persistence | +| persistence.data | object | `{"emptyDir":{"sizeLimit":"4Gi"},"enabled":false,"persistentVolumeClaim":{"accessMode":"ReadWriteOnce","claimName":"","storage":"4Gi","storageClassName":"","subPath":""}}` | Data directory storage configuration. Stores workspace data including configuration versions, modules, and run metadata. This directory is mounted to the worker sidecar container. @section -- Persistence | +| persistence.data.emptyDir | object | `{"sizeLimit":"4Gi"}` | EmptyDir volume configuration (used when enabled is false). @section -- Persistence | +| persistence.data.emptyDir.sizeLimit | string | `"4Gi"` | Size limit for the emptyDir volume. @section -- Persistence | +| persistence.data.enabled | bool | `false` | Enable persistent storage for data directory. When false, uses emptyDir (ephemeral, recommended for most use cases as each run gets fresh workspace). When true, uses PVC (persistent across pod restarts, useful for debugging or sharing data between runs). @section -- Persistence | +| persistence.data.persistentVolumeClaim | object | `{"accessMode":"ReadWriteOnce","claimName":"","storage":"4Gi","storageClassName":"","subPath":""}` | PersistentVolumeClaim configuration (used when enabled is true). @section -- Persistence | +| persistence.data.persistentVolumeClaim.accessMode | string | `"ReadWriteOnce"` | Access mode for the PVC. @section -- Persistence | +| persistence.data.persistentVolumeClaim.claimName | string | `""` | Name of an existing PVC. If empty, a new PVC named `-data` is created. @section -- Persistence | +| persistence.data.persistentVolumeClaim.storage | string | `"4Gi"` | Storage size for the PVC. @section -- Persistence | +| persistence.data.persistentVolumeClaim.storageClassName | string | `""` | Storage class for the PVC. Leave empty to use the cluster's default storage class. 
@section -- Persistence | +| persistence.data.persistentVolumeClaim.subPath | string | `""` | Optional subPath for mounting a specific subdirectory of the volume. @section -- Persistence | +| rbac.clusterRules | list | `[{"apiGroups":["scalr.io"],"resources":["agenttasktemplates"],"verbs":["get","list","watch"]}]` | Cluster-wide RBAC rules (applied via ClusterRole bound in the release namespace). @section -- RBAC | +| rbac.create | bool | `true` | Create the namespaced Role/RoleBinding and cluster-scope RoleBinding. @section -- RBAC | +| rbac.rules | list | `[{"apiGroups":[""],"resources":["pods"],"verbs":["get","list","watch","create","delete","deletecollection","patch","update"]},{"apiGroups":[""],"resources":["pods/log"],"verbs":["get"]},{"apiGroups":[""],"resources":["pods/exec"],"verbs":["get","create"]},{"apiGroups":[""],"resources":["pods/status"],"verbs":["get","patch","update"]},{"apiGroups":["apps"],"resources":["deployments"],"verbs":["get","list","watch"]},{"apiGroups":["batch"],"resources":["jobs"],"verbs":["get","list","watch","create","delete","deletecollection","patch","update"]},{"apiGroups":["batch"],"resources":["jobs/status"],"verbs":["get","patch","update"]},{"apiGroups":[""],"resources":["events"],"verbs":["list"]}]` | Namespaced RBAC rules granted to the controller ServiceAccount. @section -- RBAC | +| serviceAccount.annotations | object | `{}` | Annotations for the service account. @section -- Service account | +| serviceAccount.automountToken | bool | `true` | Whether to automount the service account token in pods. @section -- Service account | +| serviceAccount.create | bool | `true` | Create a Kubernetes service account for the Scalr Agent. @section -- Service account | +| serviceAccount.labels | object | `{}` | Additional labels for the service account. @section -- Service account | +| serviceAccount.name | string | `""` | Name of the service account. Generated if not set and create is true. 
@section -- Service account | +| serviceAccount.tokenTTL | int | `3600` | Token expiration period in seconds. @section -- Service account | +| task.affinity | object | `{}` | Node affinity for task job pods. @section -- Task | +| task.allowMetadataService | bool | `false` | When set to `true`, disables the NetworkPolicy that blocks access to the VM metadata service (`169.254.169.254`) for agent task containers. When set to `false` (default), a NetworkPolicy is created to prevent workloads from accessing cloud credentials or instance metadata. @section -- Task | +| task.extraVolumes | list | `[]` | Additional volumes for task job pods. @section -- Task | +| task.job | object | `{"basename":"","ttlSecondsAfterFinished":60}` | Job configuration for task execution. @section -- Task | +| task.job.basename | string | `""` | Base name prefix for spawned Kubernetes Jobs (defaults to fullname, e.g., "scalr-agent"). Jobs are named as `-`. See README for details on task naming. @section -- Task | +| task.job.ttlSecondsAfterFinished | int | `60` | Time in seconds after job completion before it is automatically deleted. @section -- Task | +| task.jobAnnotations | object | `{}` | Additional annotations for the Job (workload object). @section -- Task | +| task.jobLabels | object | `{}` | Additional labels for the Job (workload object). @section -- Task | +| task.nodeSelector | object | `{}` | Node selector for assigning task job pods to specific nodes. Example: `--set task.nodeSelector."node-type"="agent-worker"` @section -- Task | +| task.podAnnotations | object | `{}` | Task-specific pod annotations (merged with global.podAnnotations, overrides duplicate keys). @section -- Task | +| task.podLabels | object | `{}` | Task-specific pod labels (merged with global.podLabels, overrides duplicate keys). @section -- Task | +| task.podSecurityContext | object | `{}` | Task-specific pod security context (merged with global.podSecurityContext, overrides duplicate keys). 
@section -- Task | +| task.runner | object | `{"extraEnv":{},"extraVolumeMounts":[],"image":{"pullPolicy":"IfNotPresent","repository":"scalr/runner","tag":"0.2.0"},"memorySoftLimitPercent":80,"memoryWarnPercent":90,"resources":{"limits":{"cpu":"4000m","memory":"2048Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"securityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"privileged":false,"readOnlyRootFilesystem":true,"runAsNonRoot":true,"seLinuxOptions":{}}}` | Runner container configuration (environment where Terraform/OpenTofu commands are executed). @section -- Task | +| task.runner.extraEnv | object | `{}` | Additional environment variables for the runner container. @section -- Task | +| task.runner.extraVolumeMounts | list | `[]` | Additional volume mounts for the runner container. @section -- Task | +| task.runner.image | object | `{"pullPolicy":"IfNotPresent","repository":"scalr/runner","tag":"0.2.0"}` | Runner container image settings. Default image: https://hub.docker.com/r/scalr/runner, repository: https://github.com/Scalr/runner Note: For Scalr-managed agents, this may be overridden by Scalr account image settings. @section -- Task | +| task.runner.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. @section -- Task | +| task.runner.image.repository | string | `"scalr/runner"` | Default repository for the runner image. @section -- Task | +| task.runner.image.tag | string | `"0.2.0"` | Default tag for the runner image. @section -- Task | +| task.runner.memorySoftLimitPercent | int | `80` | Memory soft limit as a percentage of the hard limit (task.runner.resources.limits.memory). When memory usage exceeds this value, the process will be gracefully terminated by the agent. Graceful termination ensures that OpenTofu/Terraform workloads push state before exiting, preventing state loss. Setting this value too high reduces the memory headroom available for state push and increases the risk of state loss. 
Has no effect when task.runner.resources.limits.memory is not set. For example, when task.runner.resources.limits.memory is set to 1000Mi and memorySoftLimitPercent is 80%, the workload will be gracefully terminated when memory usage reaches 800Mi. @section -- Task |
+| task.runner.memoryWarnPercent | int | `90` | Memory warning threshold as a percentage of the soft limit (task.runner.memorySoftLimitPercent). A warning is logged to the run console when memory usage exceeds this value, indicating that the workload is at risk of being terminated due to high memory usage. The warning is reported after the run completes. Has no effect when task.runner.memorySoftLimitPercent or task.runner.resources.limits.memory are not set. @section -- Task |
+| task.runner.resources | object | `{"limits":{"cpu":"4000m","memory":"2048Mi"},"requests":{"cpu":"500m","memory":"512Mi"}}` | Resource requests and limits for the runner container. Note: For Scalr-managed agents, this may be overridden by Scalr platform billing resource tier presets. @section -- Task |
+| task.runner.securityContext | object | `{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"privileged":false,"readOnlyRootFilesystem":true,"runAsNonRoot":true,"seLinuxOptions":{}}` | Security context for the runner container. The default declaration duplicates some critical options from podSecurityContext to keep them independent. @section -- Task |
+| task.runner.securityContext.allowPrivilegeEscalation | bool | `false` | Allow privilege escalation. @section -- Task |
+| task.runner.securityContext.capabilities | object | `{"drop":["ALL"]}` | Container capabilities restrictions for security. @section -- Task |
+| task.runner.securityContext.privileged | bool | `false` | Run container in privileged mode. @section -- Task |
+| task.runner.securityContext.readOnlyRootFilesystem | bool | `true` | Read-only root filesystem. 
@section -- Task | +| task.runner.securityContext.runAsNonRoot | bool | `true` | Run container as non-root user for security. @section -- Task | +| task.runner.securityContext.seLinuxOptions | object | `{}` | SELinux options for the container. @section -- Task | +| task.sidecars | list | `[]` | Additional sidecar containers for task job pods. @section -- Task | +| task.startupTimeoutSeconds | int | `180` | Maximum time in seconds for the agent worker container to become ready and begin Scalr run execution. If the pod does not start within this period, the controller fails the Scalr run and deletes the job. @section -- Task | +| task.terminationGracePeriodSeconds | int | `360` | Grace period in seconds before forcibly terminating task job containers. @section -- Task | +| task.tolerations | list | `[]` | Node tolerations for task job pods. Expects input structure as per specification . Example: `--set task.tolerations[0].key=dedicated,task.tolerations[0].operator=Equal,task.tolerations[0].value=agent-worker,task.tolerations[0].effect=NoSchedule` @section -- Task | +| task.worker | object | `{"extraEnv":{},"extraVolumeMounts":[],"resources":{"limits":{"memory":"1024Mi"},"requests":{"cpu":"250m","memory":"256Mi"}},"securityContext":{}}` | Worker container configuration (sidecar that supervises task execution). @section -- Task | +| task.worker.extraEnv | object | `{}` | Additional environment variables for the worker container (merged with agent.extraEnv). @section -- Task | +| task.worker.extraVolumeMounts | list | `[]` | Additional volume mounts for the worker container. @section -- Task | +| task.worker.resources | object | `{"limits":{"memory":"1024Mi"},"requests":{"cpu":"250m","memory":"256Mi"}}` | Resource requests and limits for the worker container. @section -- Task | +| task.worker.securityContext | object | `{}` | Security context for the worker container. 
@section -- Task | ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/charts/agent-job/README.md.gotmpl b/charts/agent-job/README.md.gotmpl index 67594097..0cdde640 100644 --- a/charts/agent-job/README.md.gotmpl +++ b/charts/agent-job/README.md.gotmpl @@ -458,11 +458,15 @@ This feature relies on egress NetworkPolicy enforcement, which requires a compat Ensure your cluster uses a CNI plugin that supports egress NetworkPolicies. Tested configurations: -| Cluster | CNI | IMDS Blocked | -|---------|-----|:------------:| -| AWS EKS | Calico | ✅ | -| GKE | Dataplane V1 (Calico) | ❌ | -| GKE | [Dataplane V2](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/dataplane-v2) (Cilium/eBPF) | ✅ | + +| Cluster | CNI / network setup | IMDS blocked | +|-----------|--------------------------------------------------------------------------------------|:------------:| +| AWS EKS | Amazon VPC CNI (data plane) + Calico for network policy only (tigera-operator with `cni.type: AmazonVPC`) | ✅ | +| GKE | Dataplane V1 (Calico) | ❌ | +| GKE | [Dataplane V2](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/dataplane-v2) (Cilium/eBPF) | ✅ | +| Azure AKS | Azure CNI (network plugin) + Cilium (network data-plane) | ✅ | + +**Note (EKS):** Pod networking is provided by Amazon VPC CNI. Calico is used only as the network policy engine (no Calico data plane); the VPC CNI is patched so Calico can enforce policy (e.g. 
`ANNOTATE_POD_IP=true` on the aws-node DaemonSet) ## Network Requirements diff --git a/charts/agent-k8s/README.md b/charts/agent-k8s/README.md index 815de5c4..2c85c0cc 100644 --- a/charts/agent-k8s/README.md +++ b/charts/agent-k8s/README.md @@ -397,4 +397,4 @@ If your cluster doesn't currently support egress NetworkPolicies, you may need t | workerTolerations | list | `[]` | Kubernetes Node Tolerations for the agent worker and the agent task pods. Expects input structure as per specification . Example: `--set workerTolerations[0].operator=Equal,workerTolerations[0].effect=NoSchedule,workerTolerations[0].key=dedicated,workerTolerations[0].value=scalr-agent-worker-pool` | ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/charts/agent-local/README.md b/charts/agent-local/README.md index f2adc733..7c3511b3 100644 --- a/charts/agent-local/README.md +++ b/charts/agent-local/README.md @@ -318,109 +318,62 @@ It's best to pull the logs immediately after an incident, since this command wil ## Values -### Scheduling & Placement - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| affinity | object | `{}` | Affinity rules for pod scheduling. | -| nodeSelector | object | `{}` | Node selector for scheduling Scalr Agent pods. | -| replicaCount | int | `1` | Number of replicas for the Scalr Agent deployment. Adjust for high availability. | -| tolerations | list | `[]` | Tolerations for scheduling pods on tainted nodes. | - -### Agent - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| agent.cacheDir | string | `"/var/lib/scalr-agent/cache"` | Cache directory where the agent stores provider binaries, plugin cache, and metadata. 
This directory must be readable, writable, and executable. | -| agent.dataDir | string | `"/var/lib/scalr-agent/data"` | Data directory where the agent stores workspace data (configuration versions, modules, and providers). This directory must be readable, writable, and executable. | -| agent.shutdownMode | string | `"graceful"` | The agent termination behaviour. Can be graceful, force or drain. See https://docs.scalr.io/docs/configuration#scalr_agent_worker_on_stop_action | -| agent.token | string | `""` | The agent pool token. | -| agent.tokenExistingSecret | object | `{"key":"token","name":""}` | Pre-existing Kubernetes secret for the Scalr Agent token. | -| agent.tokenExistingSecret.key | string | `"token"` | Key within the secret that holds the token value. | -| agent.tokenExistingSecret.name | string | `""` | Name of the secret containing the token. | -| agent.url | string | `""` | The Scalr API endpoint URL. For tokens generated after Scalr version 8.162.0, this value is optional, as the domain can be extracted from the token payload. However, it is recommended to specify the URL explicitly for long-lived services to avoid issues if the account is renamed. | - -### Security - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| allowMetadataService | bool | `true` | When set to `true` (default), disables the NetworkPolicy that blocks access to the VM metadata service (`169.254.169.254`) for agent containers. When set to `false`, a NetworkPolicy is created to prevent workloads from accessing cloud credentials or instance metadata. | -| podSecurityContext | object | `{"fsGroup":1000,"runAsNonRoot":true}` | Security context for Scalr Agent pod. | -| secret | object | `{"annotations":{},"labels":{}}` | Secret configuration for storing the Scalr Agent token. | -| secret.annotations | object | `{}` | Annotations for the Secret resource. | -| secret.labels | object | `{}` | Additional labels for the Secret resource. 
| -| securityContext | object | `{"capabilities":{"drop":["ALL"]},"privileged":false,"procMount":"Default","runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000}` | Security context for Scalr Agent container. | -| securityContext.capabilities | object | `{"drop":["ALL"]}` | Restrict container capabilities for security. | -| securityContext.privileged | bool | `false` | Run container in privileged mode. Enable only if required. | -| securityContext.procMount | string | `"Default"` | Proc mount type. Valid values: Default, Unmasked, Host. | - -### Pod Configuration - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| extraEnv | object | `{}` | Additional environment variables for Scalr Agent. Use to configure HTTP proxies or other runtime parameters. | -| podAnnotations | object | `{}` | Annotations for Scalr Agent pods (e.g., for monitoring or logging). | - -### Image - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. 'IfNotPresent' is efficient for stable deployments. | -| image.repository | string | `"scalr/agent-runner"` | Docker repository for the Scalr Agent image. | -| image.tag | string | `""` | Image tag. Overrides the default (chart appVersion). Leave empty to use chart default. | -| imagePullSecrets | list | `[]` | Image pull secret to use for registry authentication. | - -### Persistence - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| persistence.cache | object | `{"emptyDir":{"sizeLimit":"20Gi"}}` | Cache directory storage configuration. Stores OpenTofu/Terraform providers, modules and binaries. | -| persistence.cache.emptyDir | object | `{"sizeLimit":"20Gi"}` | EmptyDir volume configuration (used when persistence.enabled is false). | -| persistence.cache.emptyDir.sizeLimit | string | `"20Gi"` | Size limit for the emptyDir volume. 
| -| persistence.data | object | `{"emptyDir":{"sizeLimit":"4Gi"}}` | Data directory storage configuration. Stores workspace data including configuration versions, modules, and run metadata. | -| persistence.data.emptyDir | object | `{"sizeLimit":"4Gi"}` | EmptyDir volume configuration. | -| persistence.data.emptyDir.sizeLimit | string | `"4Gi"` | Size limit for the emptyDir volume. | -| persistence.enabled | bool | `false` | Enable persistent storage for cache volume. If false, uses emptyDir (ephemeral storage). | -| persistence.persistentVolumeClaim | object | `{"accessMode":"ReadWriteOnce","claimName":"","storage":"20Gi","storageClassName":"","subPath":""}` | Configuration for persistentVolumeClaim for cache volume (used when persistence.enabled is true). | -| persistence.persistentVolumeClaim.accessMode | string | `"ReadWriteOnce"` | Access mode for the PVC. Use "ReadWriteOnce" for single-replica deployments. Use "ReadWriteMany" only if the Scalr Agent supports shared storage (e.g., with NFS). | -| persistence.persistentVolumeClaim.claimName | string | `""` | Name of an existing PVC. If empty, a new PVC is created dynamically. | -| persistence.persistentVolumeClaim.storage | string | `"20Gi"` | Storage size for the PVC. | -| persistence.persistentVolumeClaim.storageClassName | string | `""` | Storage class for the PVC. Leave empty to use the cluster's default storage class. Set to "-" to disable dynamic provisioning and require a pre-existing PVC. | -| persistence.persistentVolumeClaim.subPath | string | `""` | Optional subPath for mounting a specific subdirectory of the volume. | - -### Resource Management - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| resources | object | `{"limits":{"cpu":"4000m","memory":"2048Mi"},"requests":{"cpu":"1000m","memory":"1024Mi"}}` | Resource limits and requests for Scalr Agent pods. Set identical resource limits and requests to enable Guaranteed QoS and minimize eviction risk. 
See: https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#quality-of-service-classes | - -### Service account - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| serviceAccount.annotations | object | `{}` | Annotations for the service account. | -| serviceAccount.automountToken | bool | `false` | Whether to automount the service account token in the Scalr Agent pod. | -| serviceAccount.create | bool | `false` | Create a Kubernetes service account for the Scalr Agent. | -| serviceAccount.labels | object | `{}` | Additional labels for the service account. | -| serviceAccount.name | string | `""` | Name of the service account. Generated if not set and 'create' is true. | - -### Deployment & Scaling - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| terminationGracePeriodSeconds | int | `120` | Termination grace period (in seconds) for pod shutdown. | - -### Other Values - | Key | Type | Default | Description | |-----|------|---------|-------------| +| affinity | object | `{}` | Affinity rules for pod scheduling. @section -- Scheduling & Placement | +| agent.cacheDir | string | `"/var/lib/scalr-agent/cache"` | Cache directory where the agent stores provider binaries, plugin cache, and metadata. This directory must be readable, writable, and executable. @section -- Agent | +| agent.dataDir | string | `"/var/lib/scalr-agent/data"` | Data directory where the agent stores workspace data (configuration versions, modules, and providers). This directory must be readable, writable, and executable. @section -- Agent | +| agent.shutdownMode | string | `"graceful"` | The agent termination behaviour. Can be graceful, force or drain. See https://docs.scalr.io/docs/configuration#scalr_agent_worker_on_stop_action @section -- Agent | +| agent.token | string | `""` | The agent pool token. 
@section -- Agent | +| agent.tokenExistingSecret | object | `{"key":"token","name":""}` | Pre-existing Kubernetes secret for the Scalr Agent token. @section -- Agent | +| agent.tokenExistingSecret.key | string | `"token"` | Key within the secret that holds the token value. @section -- Agent | +| agent.tokenExistingSecret.name | string | `""` | Name of the secret containing the token. @section -- Agent | +| agent.url | string | `""` | The Scalr API endpoint URL. For tokens generated after Scalr version 8.162.0, this value is optional, as the domain can be extracted from the token payload. However, it is recommended to specify the URL explicitly for long-lived services to avoid issues if the account is renamed. @section -- Agent | +| allowMetadataService | bool | `true` | When set to `true` (default), disables the NetworkPolicy that blocks access to the VM metadata service (`169.254.169.254`) for agent containers. When set to `false`, a NetworkPolicy is created to prevent workloads from accessing cloud credentials or instance metadata. @section -- Security | +| extraEnv | object | `{}` | Additional environment variables for Scalr Agent. Use to configure HTTP proxies or other runtime parameters. @section -- Pod Configuration | | fullnameOverride | string | `""` | Fully override the resource name for all resources. | +| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. 'IfNotPresent' is efficient for stable deployments. @section -- Image | +| image.repository | string | `"scalr/agent-runner"` | Docker repository for the Scalr Agent image. @section -- Image | +| image.tag | string | `""` | Image tag. Overrides the default (chart appVersion). Leave empty to use chart default. @section -- Image | +| imagePullSecrets | list | `[]` | Image pull secret to use for registry authentication. @section -- Image | | nameOverride | string | `""` | Override the default resource name prefix for all resources. 
| +| nodeSelector | object | `{}` | Node selector for scheduling Scalr Agent pods. @section -- Scheduling & Placement | +| persistence.cache | object | `{"emptyDir":{"sizeLimit":"20Gi"}}` | Cache directory storage configuration. Stores OpenTofu/Terraform providers, modules and binaries. @section -- Persistence | +| persistence.cache.emptyDir | object | `{"sizeLimit":"20Gi"}` | EmptyDir volume configuration (used when persistence.enabled is false). @section -- Persistence | +| persistence.cache.emptyDir.sizeLimit | string | `"20Gi"` | Size limit for the emptyDir volume. @section -- Persistence | +| persistence.data | object | `{"emptyDir":{"sizeLimit":"4Gi"}}` | Data directory storage configuration. Stores workspace data including configuration versions, modules, and run metadata. @section -- Persistence | +| persistence.data.emptyDir | object | `{"sizeLimit":"4Gi"}` | EmptyDir volume configuration. @section -- Persistence | +| persistence.data.emptyDir.sizeLimit | string | `"4Gi"` | Size limit for the emptyDir volume. @section -- Persistence | +| persistence.enabled | bool | `false` | Enable persistent storage for cache volume. If false, uses emptyDir (ephemeral storage). @section -- Persistence | +| persistence.persistentVolumeClaim | object | `{"accessMode":"ReadWriteOnce","claimName":"","storage":"20Gi","storageClassName":"","subPath":""}` | Configuration for persistentVolumeClaim for cache volume (used when persistence.enabled is true). @section -- Persistence | +| persistence.persistentVolumeClaim.accessMode | string | `"ReadWriteOnce"` | Access mode for the PVC. Use "ReadWriteOnce" for single-replica deployments. Use "ReadWriteMany" only if the Scalr Agent supports shared storage (e.g., with NFS). @section -- Persistence | +| persistence.persistentVolumeClaim.claimName | string | `""` | Name of an existing PVC. If empty, a new PVC is created dynamically. 
@section -- Persistence | +| persistence.persistentVolumeClaim.storage | string | `"20Gi"` | Storage size for the PVC. @section -- Persistence | +| persistence.persistentVolumeClaim.storageClassName | string | `""` | Storage class for the PVC. Leave empty to use the cluster's default storage class. Set to "-" to disable dynamic provisioning and require a pre-existing PVC. @section -- Persistence | +| persistence.persistentVolumeClaim.subPath | string | `""` | Optional subPath for mounting a specific subdirectory of the volume. @section -- Persistence | +| podAnnotations | object | `{}` | Annotations for Scalr Agent pods (e.g., for monitoring or logging). @section -- Pod Configuration | +| podSecurityContext | object | `{"fsGroup":1000,"runAsNonRoot":true}` | Security context for Scalr Agent pod. @section -- Security | +| replicaCount | int | `1` | Number of replicas for the Scalr Agent deployment. Adjust for high availability. @section -- Scheduling & Placement | +| resources | object | `{"limits":{"cpu":"4000m","memory":"2048Mi"},"requests":{"cpu":"1000m","memory":"1024Mi"}}` | Resource limits and requests for Scalr Agent pods. Set identical resource limits and requests to enable Guaranteed QoS and minimize eviction risk. See: https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#quality-of-service-classes @section -- Resource Management | +| secret | object | `{"annotations":{},"labels":{}}` | Secret configuration for storing the Scalr Agent token. @section -- Security | +| secret.annotations | object | `{}` | Annotations for the Secret resource. @section -- Security | +| secret.labels | object | `{}` | Additional labels for the Secret resource. @section -- Security | +| securityContext | object | `{"capabilities":{"drop":["ALL"]},"privileged":false,"procMount":"Default","runAsGroup":1000,"runAsNonRoot":true,"runAsUser":1000}` | Security context for Scalr Agent container. 
@section -- Security | +| securityContext.capabilities | object | `{"drop":["ALL"]}` | Restrict container capabilities for security. @section -- Security | +| securityContext.privileged | bool | `false` | Run container in privileged mode. Enable only if required. @section -- Security | +| securityContext.procMount | string | `"Default"` | Proc mount type. Valid values: Default, Unmasked, Host. @section -- Security | +| serviceAccount.annotations | object | `{}` | Annotations for the service account. @section -- Service account | +| serviceAccount.automountToken | bool | `false` | Whether to automount the service account token in the Scalr Agent pod. @section -- Service account | +| serviceAccount.create | bool | `false` | Create a Kubernetes service account for the Scalr Agent. @section -- Service account | +| serviceAccount.labels | object | `{}` | Additional labels for the service account. @section -- Service account | +| serviceAccount.name | string | `""` | Name of the service account. Generated if not set and 'create' is true. @section -- Service account | | strategy | object | `{"rollingUpdate":{"maxSurge":"25%","maxUnavailable":"50%"},"type":"RollingUpdate"}` | Deployment strategy configuration. | | strategy.rollingUpdate | object | `{"maxSurge":"25%","maxUnavailable":"50%"}` | Rolling update parameters. | | strategy.rollingUpdate.maxSurge | string | `"25%"` | Maximum number of pods that can be created above the desired number during an update. | | strategy.rollingUpdate.maxUnavailable | string | `"50%"` | Maximum number of pods that can be unavailable during an update. | | strategy.type | string | `"RollingUpdate"` | Type of deployment strategy. Options: RollingUpdate, Recreate. | +| terminationGracePeriodSeconds | int | `120` | Termination grace period (in seconds) for pod shutdown. @section -- Deployment & Scaling | +| tolerations | list | `[]` | Tolerations for scheduling pods on tainted nodes. 
@section -- Scheduling & Placement | ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)