diff --git a/.gitignore b/.gitignore index 157490ae..e5e0a092 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ autoshift/files/ release-artifacts/ .cache/ .mcp.json +my-values.yaml \ No newline at end of file diff --git a/autoshift/values.hub.yaml b/autoshift/values.hub.yaml index 8f5261b5..6a0661af 100644 --- a/autoshift/values.hub.yaml +++ b/autoshift/values.hub.yaml @@ -38,6 +38,17 @@ hubClusterSets: servicemesh3-source: redhat-operators servicemesh3-source-namespace: openshift-marketplace # servicemesh3-version: 'servicemeshoperator3.v3.2.1' + # Istio control plane configuration + # servicemesh3-istio-name: default + # servicemesh3-istio-namespace: istio-system + # servicemesh3-istio-version: 'v1.26.2' + # servicemesh3-istio-update-strategy: InPlace + # servicemesh3-istio-pilot-cpu-request: '100m' + # servicemesh3-istio-pilot-memory-request: '256Mi' + # IstioCNI configuration + # servicemesh3-istio-cni-enabled: 'true' + # servicemesh3-istio-cni-name: default + # servicemesh3-istio-cni-version: 'v1.26.2' self-managed: 'true' # OpenShift version for cluster deployment and image mirroring openshift-version: '4.18.28' @@ -254,6 +265,36 @@ hubClusterSets: node-feature-discovery-source: redhat-operators node-feature-discovery-source-namespace: openshift-marketplace node-feature-discovery-version: 'nfd.4.18.0-202511261113' + ### NVIDIA GPU Operator + # nvidia-gpu: 'false' + # nvidia-gpu-subscription-name: gpu-operator-certified + # nvidia-gpu-channel: stable + # nvidia-gpu-source: certified-operators + # nvidia-gpu-source-namespace: openshift-marketplace + # # nvidia-gpu-version: 'gpu-operator-certified.v24.9.0' + # nvidia-gpu-driver: 'true' + # nvidia-gpu-driver-toolkit: 'true' + # nvidia-gpu-toolkit: 'true' + # nvidia-gpu-device-plugin: 'true' + # nvidia-gpu-dcgm: 'true' + # nvidia-gpu-mig: 'false' + # nvidia-gpu-gfd: 'true' + # ### Red Hat OpenShift AI (RHOAI) + # rhoai: 'false' + # rhoai-subscription-name: rhods-operator + # rhoai-channel: fast-3.x + # 
rhoai-source: redhat-operators + # rhoai-source-namespace: openshift-marketplace + # # rhoai-version: 'rhods-operator.v3.0.0' + # rhoai-dashboard: 'true' + # rhoai-workbenches: 'true' + # rhoai-aipipelines: 'true' + # rhoai-kserve: 'true' + # rhoai-modelregistry: 'true' + # rhoai-ray: 'true' + # rhoai-kueue: 'true' + # rhoai-training: 'true' + # rhoai-trustyai: 'true' # Spoke Cluster Sets with Feature Flags. Blank will remove managedClusterSets: managed: @@ -279,6 +320,17 @@ managedClusterSets: servicemesh3-source: redhat-operators servicemesh3-source-namespace: openshift-marketplace servicemesh3-version: 'servicemeshoperator3.v3.2.1' + # Istio control plane configuration + # servicemesh3-istio-name: default + # servicemesh3-istio-namespace: istio-system + # servicemesh3-istio-version: 'v1.26.2' + # servicemesh3-istio-update-strategy: InPlace + # servicemesh3-istio-pilot-cpu-request: '100m' + # servicemesh3-istio-pilot-memory-request: '256Mi' + # IstioCNI configuration + # servicemesh3-istio-cni-enabled: 'true' + # servicemesh3-istio-cni-name: default + # servicemesh3-istio-cni-version: 'v1.26.2' # OpenShift version for cluster deployment and image mirroring openshift-version: '4.18.28' ### Infrastructure Nodes @@ -389,6 +441,36 @@ managedClusterSets: compliance-source: redhat-operators compliance-source-namespace: openshift-marketplace compliance-channel: stable + ### NVIDIA GPU Operator + # nvidia-gpu: 'false' + # nvidia-gpu-subscription-name: gpu-operator-certified + # nvidia-gpu-channel: stable + # nvidia-gpu-source: certified-operators + # nvidia-gpu-source-namespace: openshift-marketplace + # # nvidia-gpu-version: 'gpu-operator-certified.v24.9.0' + # nvidia-gpu-driver: 'true' + # nvidia-gpu-driver-toolkit: 'true' + # nvidia-gpu-toolkit: 'true' + # nvidia-gpu-device-plugin: 'true' + # nvidia-gpu-dcgm: 'true' + # nvidia-gpu-mig: 'false' + # nvidia-gpu-gfd: 'true' + # ### Red Hat OpenShift AI (RHOAI) + # rhoai: 'false' + # rhoai-subscription-name: rhods-operator 
+ # rhoai-channel: fast-3.x + # rhoai-source: redhat-operators + # rhoai-source-namespace: openshift-marketplace + # # rhoai-version: 'rhods-operator.v3.0.0' + # rhoai-dashboard: 'true' + # rhoai-workbenches: 'true' + # rhoai-aipipelines: 'true' + # rhoai-kserve: 'true' + # rhoai-modelregistry: 'true' + # rhoai-ray: 'true' + # rhoai-kueue: 'true' + # rhoai-training: 'true' + # rhoai-trustyai: 'true' ### Disconnected Mirror Settings # Enable disconnected mirroring (uses mirror catalog sources instead of default) # disconnected-mirror: 'true' @@ -420,6 +502,17 @@ managedClusterSets: # servicemesh3-source: redhat-operators # servicemesh3-source-namespace: openshift-marketplace # servicemesh3-version: 'servicemeshoperator3.v3.2.1' +# # Istio control plane configuration +# servicemesh3-istio-name: default +# servicemesh3-istio-namespace: istio-system +# servicemesh3-istio-version: 'v1.26.2' +# servicemesh3-istio-update-strategy: InPlace +# servicemesh3-istio-pilot-cpu-request: '100m' +# servicemesh3-istio-pilot-memory-request: '256Mi' +# # IstioCNI configuration +# servicemesh3-istio-cni-enabled: 'true' +# servicemesh3-istio-cni-name: default +# servicemesh3-istio-cni-version: 'v1.26.2' # infra-nodes-numcpu: '8' # infra-nodes-memory-mib: '24576' # infra-nodes-numcores-per-socket: '4' diff --git a/docs/README.md b/docs/README.md index f42596a7..f45f64dc 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,8 +5,9 @@ Complete documentation for AutoShift - Infrastructure as Code for OpenShift usin ## Quick Links ### Getting Started -- **[Quick Start Guide](quickstart-oci.md)** - Get started with AutoShift in 15 minutes -- **[OCI Deployment Guide](deploy-oci.md)** - Deploy AutoShift from OCI registries (recommended) +- **[Quick Start from Source](quickstart-from-source.md)** - Deploy from Git for testing/development +- **[Quick Start (OCI)](quickstart-oci.md)** - Deploy from OCI registry in 15 minutes +- **[OCI Deployment Guide](deploy-oci.md)** - Full OCI deployment guide 
(recommended for production) ### Release & Operations - **[Release Guide](releases.md)** - How to create and publish AutoShift releases @@ -14,6 +15,7 @@ Complete documentation for AutoShift - Infrastructure as Code for OpenShift usin ### Development - **[Developer Guide](developer-guide.md)** - Contributing to AutoShift and advanced configuration +- **[Adding New Operators](adding-new-operators.md)** - Step-by-step guide to add operators and contribute upstream ## Documentation Overview @@ -46,8 +48,9 @@ See `autoshift/values.minimal.yaml` for a minimal configuration example. #### Development Workflow 1. Review [Developer Guide](developer-guide.md) for contribution guidelines -2. Use `values.minimal.yaml` as a starting point for new features -3. Test with Git mode before creating OCI releases +2. Follow [Adding New Operators](adding-new-operators.md) to add operator policies +3. Use `values.minimal.yaml` as a starting point for new features +4. Test with Git mode before creating OCI releases ## Architecture diff --git a/docs/adding-new-operators.md b/docs/adding-new-operators.md new file mode 100644 index 00000000..1d445586 --- /dev/null +++ b/docs/adding-new-operators.md @@ -0,0 +1,616 @@ +# Adding New Operators to AutoShift + +This guide walks you through adding a new operator to AutoShift and contributing it upstream so it works in production environments. 
+ +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Step 1: Research the Operator](#step-1-research-the-operator) +- [Step 2: Generate the Policy](#step-2-generate-the-policy) +- [Step 3: Understand Generated Files](#step-3-understand-generated-files) +- [Step 4: Add Labels to Values Files](#step-4-add-labels-to-values-files) +- [Step 5: Add Operator Configuration (Optional)](#step-5-add-operator-configuration-optional) +- [Step 6: Test Locally](#step-6-test-locally) +- [Step 7: Contribute Upstream](#step-7-contribute-upstream) +- [Complete Example Walkthrough](#complete-example-walkthrough) +- [Troubleshooting](#troubleshooting) + +--- + +## Overview + +AutoShift uses Red Hat Advanced Cluster Management (RHACM) policies to deploy and manage operators across OpenShift clusters. When you add a new operator to AutoShift, you're creating: + +1. **A Helm chart** in `policies/<component-name>/` containing the RHACM policy +2. **Cluster labels** in `autoshift/values.*.yaml` files that enable/configure the operator +3. **Optional configuration policies** for operator-specific custom resources + +The policy is automatically picked up by ArgoCD's ApplicationSet and deployed to clusters that have the matching labels. + +--- + +## Prerequisites + +- Access to a hub cluster running ACM (for testing) +- `oc` CLI installed and logged into a cluster +- `helm` CLI installed +- Git repository cloned locally: + +```bash +git clone https://github.com/auto-shift/autoshiftv2.git +cd autoshiftv2 +``` + +--- + +## Step 1: Research the Operator + +Before generating a policy, gather the operator's details from OperatorHub. 
+ +### Find the Operator in the Catalog + +```bash +# List available operators (search for yours) +oc get packagemanifests -n openshift-marketplace | grep -i <operator-name> + +# Example: searching for cert-manager +oc get packagemanifests -n openshift-marketplace | grep -i cert +``` + +### Get Operator Details + +```bash +# Get full details about the operator +oc describe packagemanifest <package-name> -n openshift-marketplace + +# Example output includes: +# - Package name (subscription name) +# - Available channels +# - Current CSV versions +# - Catalog source +``` + +### Key Information to Collect + +| Field | Description | Example | +|-------|-------------|---------| +| **Package Name** | The OLM subscription name | `openshift-pipelines-operator-rh` | +| **Channel** | The update channel | `stable`, `fast`, `pipelines-1.20` | +| **Catalog Source** | Where the operator comes from | `redhat-operators` | +| **Source Namespace** | Usually `openshift-marketplace` | `openshift-marketplace` | +| **Target Namespace** | Where the operator installs | `openshift-operators`, custom namespace | +| **Install Mode** | Cluster-scoped or namespace-scoped | Most are cluster-scoped | + +### Get Available Versions + +```bash +# List the current CSV of each channel +oc get packagemanifests <package-name> -o jsonpath='{.status.channels[*].currentCSV}' + +# Get all channels with their versions +oc get packagemanifest <package-name> -o yaml | grep -A2 "channels:" +``` + +--- + +## Step 2: Generate the Policy + +Use the policy generator script to create the Helm chart structure. 
+ +### Basic Usage + +```bash +./scripts/generate-operator-policy.sh <component-name> <subscription-name> \ + --channel <channel> \ + --namespace <namespace> \ + [options] +``` + +### Required Parameters + +| Parameter | Description | +|-----------|-------------| +| `component-name` | Kebab-case name for your policy (e.g., `cert-manager`) | +| `subscription-name` | The exact OLM package name from Step 1 | +| `--channel` | Operator channel to subscribe to | +| `--namespace` | Target namespace for the operator | + +### Optional Parameters + +| Parameter | Description | +|-----------|-------------| +| `--version <version>` | Pin to a specific operator version | +| `--namespace-scoped` | For operators that aren't cluster-scoped | +| `--add-to-autoshift` | Auto-add labels to all values files | +| `--values-files <files>` | Specific values files to update (e.g., `hub,sbx`) | +| `--source <source>` | Catalog source (default: `redhat-operators`) | +| `--source-namespace <namespace>` | Source namespace (default: `openshift-marketplace`) | + +### Examples + +**Cluster-scoped operator (most common):** + +```bash +./scripts/generate-operator-policy.sh cert-manager cert-manager-operator \ + --channel stable \ + --namespace cert-manager \ + --add-to-autoshift +``` + +**Operator with version pinning:** + +```bash +./scripts/generate-operator-policy.sh cert-manager cert-manager-operator \ + --channel stable \ + --namespace cert-manager \ + --version cert-manager.v1.14.4 \ + --add-to-autoshift +``` + +**Namespace-scoped operator:** + +```bash +./scripts/generate-operator-policy.sh my-operator my-operator-package \ + --channel stable \ + --namespace my-operator \ + --namespace-scoped \ + --add-to-autoshift +``` + +**Community operator:** + +```bash +./scripts/generate-operator-policy.sh keycloak keycloak-operator \ + --channel fast \ + --namespace keycloak \ + --source community-operators \ + --add-to-autoshift +``` + +--- + +## Step 3: Understand Generated Files + +After running the generator, you'll have a new directory under `policies/`: + +``` +policies/<component-name>/ +├── Chart.yaml 
# Helm chart metadata +├── values.yaml # Default configuration values +├── README.md # Policy documentation +└── templates/ + └── policy--operator-install.yaml # RHACM Policy + Placement +``` + +### Chart.yaml + +Contains Helm chart metadata. Usually doesn't need modification. + +### values.yaml + +Contains default values for the operator. Example: + +```yaml +policy_namespace: open-cluster-policies + +certManager: + name: cert-manager-operator + namespace: cert-manager + channel: stable + source: redhat-operators + sourceNamespace: openshift-marketplace + operatorGroupName: cert-manager-operator +``` + +### Policy Template + +The main policy file contains: + +1. **ConfigurationPolicy** - Creates the operator namespace +2. **OperatorPolicy** - Installs the operator via OLM subscription +3. **Placement** - Targets clusters with the `autoshift.io/: 'true'` label +4. **PlacementBinding** - Links the policy to the placement + +The template uses "hub templates" to read cluster labels dynamically: + +```yaml +channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/cert-manager-channel" | default "stable" {{ "hub}}" }}' +``` + +This allows per-cluster or per-clusterset overrides via labels. + +--- + +## Step 4: Add Labels to Values Files + +If you used `--add-to-autoshift`, labels are added automatically. Otherwise, add them manually. 
+ +### Label Structure + +For each operator, add these labels to `autoshift/values.*.yaml`: + +```yaml +hubClusterSets: + hub: + labels: + ### + : 'true' # Enable the operator + -subscription-name: # OLM package name + -channel: # Operator channel + -source: redhat-operators # Catalog source + -source-namespace: openshift-marketplace + # -version: '' # Optional: pin version +``` + +### Example: Adding cert-manager Labels + +```yaml +hubClusterSets: + hub: + labels: + ### cert-manager + cert-manager: 'true' + cert-manager-subscription-name: cert-manager-operator + cert-manager-channel: stable + cert-manager-source: redhat-operators + cert-manager-source-namespace: openshift-marketplace + # cert-manager-version: 'cert-manager.v1.14.4' +``` + +### Which Values Files to Update + +| File | Purpose | +|------|---------| +| `values.hub.yaml` | Main hub cluster configuration | +| `values.sbx.yaml` | Sandbox/development environment | +| `values.minimal.yaml` | Minimal configuration template | +| `values.hub.baremetal-sno.yaml` | Single-node OpenShift on bare metal | +| `values.hub.baremetal-compact.yaml` | Compact cluster on bare metal | + +### Enable for Managed Clusters + +To deploy the operator to managed (spoke) clusters: + +```yaml +managedClusterSets: + managed: + labels: + ### cert-manager + cert-manager: 'true' + cert-manager-subscription-name: cert-manager-operator + cert-manager-channel: stable + cert-manager-source: redhat-operators + cert-manager-source-namespace: openshift-marketplace +``` + +### Per-Cluster Overrides + +Override values for specific clusters: + +```yaml +clusters: + production-1: + labels: + cert-manager-channel: stable-1.14 # Use specific channel + cert-manager-version: 'cert-manager.v1.14.4' # Pin version +``` + +--- + +## Step 5: Add Operator Configuration (Optional) + +Many operators require additional configuration (Custom Resources) after installation. 
+ +### Create Configuration Policy + +Create a new template file for the operator configuration: + +```bash +cat > policies//templates/policy--config.yaml << 'EOF' +{{- $policyName := "policy--config" }} +{{- $placementName := "placement-policy--config" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + # Optional: Wait for operator to be installed first + dependencies: + - name: policy--operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: -instance + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: /v1 + kind: + metadata: + name: + namespace: {{ .Values..namespace }} + spec: + # Your operator configuration here +--- +# Include Placement and PlacementBinding (same pattern as install policy) +EOF +``` + +### Add Dependencies Between Policies + +Use the `dependencies` field to ensure proper ordering: + +```yaml +spec: + dependencies: + - name: policy-odf-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy +``` + +### Status Check Policy + +Add a policy to verify successful deployment: + +```yaml +- objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: -status + spec: + remediationAction: inform # Don't enforce, just report + severity: high + 
object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: operators.coreos.com/v1alpha1 + kind: ClusterServiceVersion + metadata: + namespace: {{ .Values..namespace }} + spec: + displayName: 'Your Operator Display Name' + status: + phase: Succeeded +``` + +--- + +## Step 6: Test Locally + +### Validate Helm Template Rendering + +```bash +# Render the templates to check for errors +helm template policies// + +# Validate against Kubernetes API +helm template policies// | oc apply --dry-run=client -f - +``` + +### Test All Policies + +```bash +# Quick validation of all policies +for policy in policies/*/; do + if [ -f "$policy/Chart.yaml" ]; then + echo "Validating $policy..." + helm template "$policy" > /dev/null && echo "✓ Valid" || echo "✗ Invalid" + fi +done +``` + +### Deploy to Test Cluster + +1. Ensure your test cluster has the required labels +2. Deploy AutoShift pointing to your branch +3. Monitor the policy status: + +```bash +# Watch policies +oc get policies -A -w + +# Check specific policy +oc describe policy policy--operator-install -n policies-autoshift + +# Check ArgoCD application +oc get applications -n openshift-gitops | grep +``` + +--- + +## Step 7: Contribute Upstream + +### Fork and Clone + +```bash +# Fork on GitHub, then clone your fork +git clone https://github.com//autoshiftv2.git +cd autoshiftv2 + +# Add upstream remote +git remote add upstream https://github.com/auto-shift/autoshiftv2.git +``` + +### Create Feature Branch + +```bash +git checkout -b feature/add--policy +``` + +### Make Your Changes + +1. Generate the policy +2. Add labels to values files +3. Add configuration policies if needed +4. 
Test locally + +### Commit with Clear Message + +```bash +git add policies/<component-name>/ +git add autoshift/values.*.yaml + +git commit -m "Add <operator-name> operator policy + +- Generate policy for <operator-name> installation +- Add labels to hub and managed clusterset values +- Include configuration for <operator-specific-resources>" +``` + +### Push and Create Pull Request + +```bash +git push origin feature/add-<component-name>-policy +``` + +Then create a PR via the GitHub web interface. + +### PR Checklist + +- [ ] Policy generated using `generate-operator-policy.sh` +- [ ] Subscription name and channel verified from OperatorHub +- [ ] Labels added to appropriate values files +- [ ] `helm template` renders without errors +- [ ] README.md included with policy documentation +- [ ] Tested on a real cluster (if possible) +- [ ] No hardcoded values (use hub templates for flexibility) + +--- + +## Complete Example Walkthrough + +Let's add the **Cert-Manager** operator end-to-end. + +### 1. Research + +```bash +oc get packagemanifests -n openshift-marketplace | grep cert +# Output: cert-manager-operator Red Hat Operators 26d + +oc describe packagemanifest cert-manager-operator -n openshift-marketplace | head -50 +``` + +Key findings: +- Package: `cert-manager-operator` +- Channel: `stable` +- Source: `redhat-operators` +- Namespace: `cert-manager` + +### 2. Generate Policy + +```bash +./scripts/generate-operator-policy.sh cert-manager cert-manager-operator \ + --channel stable \ + --namespace cert-manager \ + --add-to-autoshift +``` + +### 3. Verify Generated Files + +```bash +ls -la policies/cert-manager/ +# Chart.yaml README.md templates/ values.yaml + +cat policies/cert-manager/values.yaml +``` + +### 4. Validate + +```bash +helm template policies/cert-manager/ +``` + +### 5. Test (on a cluster with the label) + +```bash +# Add label to test cluster +oc label managedcluster local-cluster autoshift.io/cert-manager=true + +# Watch for policy compliance +oc get policies -A | grep cert-manager +``` + +### 6. 
Commit and Push + +```bash +git add policies/cert-manager/ +git add autoshift/values.hub.yaml + +git commit -m "Add cert-manager operator policy + +- Enable TLS certificate management across clusters +- Support for automatic certificate renewal +- Integrates with ACME providers" + +git push origin feature/add-cert-manager-policy +``` + +--- + +## Troubleshooting + +### Common Issues + +| Issue | Solution | +|-------|----------| +| `Policy not found by ArgoCD` | Ensure the policy directory name matches the component name | +| `Policy not applying to cluster` | Check cluster has `autoshift.io/<component-name>: 'true'` label | +| `Operator not installing` | Verify subscription name, channel, and source are correct | +| `helm template fails` | Check YAML syntax and Helm template expressions | +| `Version pinning not working` | Ensure the CSV name format is correct (e.g., `operator.v1.0.0`) | + +### Debug Commands + +```bash +# Check cluster labels +oc get managedcluster -o yaml | grep autoshift + +# Check policy status +oc describe policy policy-<component-name>-operator-install -n policies-autoshift + +# Check OperatorPolicy on spoke cluster +oc get operatorpolicy -A +oc describe operatorpolicy install-<component-name> -n <namespace> + +# Check subscription on spoke cluster +oc get sub -n <namespace> + +# View ACM policy propagator logs +oc logs -n open-cluster-management deployment/grc-policy-propagator +``` + +### Getting Help + +- Check existing policies in `policies/` for reference implementations +- Review the [Developer Guide](developer-guide.md) for architecture details +- Open an issue on [GitHub](https://github.com/auto-shift/autoshiftv2/issues) + +--- + +## See Also + +- [Developer Guide](developer-guide.md) - Full development documentation +- [Scripts README](../scripts/README.md) - Policy generator details +- [Gradual Rollout](gradual-rollout.md) - Multi-version deployment strategies +- [README](../README.md) - AutoShift cluster labels reference + diff --git a/docs/deploy-oci.md b/docs/deploy-oci.md index e6e64784..a70dde61 100644 
--- a/docs/deploy-oci.md +++ b/docs/deploy-oci.md @@ -29,7 +29,54 @@ This guide explains how to deploy AutoShift from an OCI registry (Quay, GHCR, Ha ## Deployment Steps -### Step 1: Configure OCI Registry Credentials +### Step 1: Install OpenShift GitOps Operator + +OpenShift GitOps must be installed before deploying AutoShift: + +```bash +# Install OpenShift GitOps from the local chart +helm upgrade --install openshift-gitops ./openshift-gitops \ + --namespace openshift-gitops \ + --create-namespace + +# Wait for GitOps to be ready +echo "Waiting for GitOps operator..." +oc wait --for=condition=Available deployment/openshift-gitops-operator-controller-manager \ + -n openshift-gitops-operator --timeout=300s + +# Wait for ArgoCD instance +oc wait --for=condition=Available deployment/argocd-server \ + -n openshift-gitops --timeout=300s + +# Verify GitOps is ready +oc get pods -n openshift-gitops +``` + +### Step 2: Install Red Hat ACM Operator + +Advanced Cluster Management is required for policy-based management: + +```bash +# Install ACM from the local chart +helm upgrade --install advanced-cluster-management ./advanced-cluster-management \ + --namespace open-cluster-management \ + --create-namespace + +# Wait for ACM operator +echo "Waiting for ACM operator..." +oc wait --for=condition=Available deployment -l app=multiclusterhub-operator \ + -n open-cluster-management --timeout=600s + +# Wait for MultiClusterHub to be ready (this takes ~10 minutes) +echo "Waiting for MultiClusterHub (this takes ~10 minutes)..." 
+oc wait --for=condition=Complete mch multiclusterhub \ + -n open-cluster-management --timeout=1200s + +# Verify ACM is ready +oc get mch -A +``` + +### Step 3: Configure OCI Registry Credentials If your OCI registry is private, configure credentials for ArgoCD: @@ -58,7 +105,7 @@ oc rollout restart deployment/argocd-repo-server -n openshift-gitops oc rollout restart deployment/argocd-applicationset-controller -n openshift-gitops ``` -### Step 1b: Configure Custom CA Certificate (Optional) +### Step 3b: Configure Custom CA Certificate (Optional) If your OCI registry uses a custom CA certificate (e.g., private registry with self-signed certs), you need to configure ArgoCD to trust it. @@ -94,7 +141,7 @@ oc label configmap custom-ca-certs \ Then reference it in your ArgoCD configuration by enabling `cluster_ca_bundle: true` in your values. -### Step 2: Install AutoShift from OCI Registry +### Step 4: Install AutoShift from OCI Registry #### Option A: Using Helm @@ -126,11 +173,10 @@ EOF # Install AutoShift main chart from OCI helm registry login quay.io -u myorg+robot -p TOKEN -helm install autoshift oci://quay.io/myorg/autoshift/autoshift \ +helm install autoshift oci://quay.io/autoshift/autoshift \ --version 1.0.0 \ --namespace openshift-gitops \ - --create-namespace \ - -f my-oci-values.yaml + -f my-values.yaml ``` #### Option B: Using ArgoCD Application @@ -182,7 +228,95 @@ spec: EOF ``` -### Step 3: Verify Deployment +### Step 5: Move Cluster to Hub ClusterSet (Required) + +**Important:** By default, `local-cluster` is in the `default` ClusterSet, but AutoShift policies target the `hub` ClusterSet. You must move your cluster to `hub` for policies to apply. 
+ +```bash +# Check current ClusterSet (will show 'default') +oc get managedcluster local-cluster -o jsonpath='{.metadata.labels.cluster\.open-cluster-management\.io/clusterset}' + +# Move local-cluster to the 'hub' ClusterSet +oc label managedcluster local-cluster \ + cluster.open-cluster-management.io/clusterset=hub --overwrite + +# Verify the change +oc get managedcluster local-cluster -o jsonpath='{.metadata.labels.cluster\.open-cluster-management\.io/clusterset}' +# Should output: hub +``` + +### Step 6: Apply AutoShift Labels to Enable Operators + +AutoShift uses labels on ManagedClusters to determine which operators to install. Apply the `autoshift.io/` labels for the operators you want: + +```bash +# Apply AutoShift labels to enable desired operators +# Customize these based on which operators you want to install +oc label managedcluster local-cluster \ + autoshift.io/self-managed='true' \ + autoshift.io/openshift-version='4.20.0' \ + autoshift.io/gitops='true' \ + autoshift.io/gitops-subscription-name='openshift-gitops-operator' \ + autoshift.io/gitops-channel='gitops-1.18' \ + autoshift.io/gitops-source='redhat-operators' \ + autoshift.io/gitops-source-namespace='openshift-marketplace' \ + autoshift.io/acm-subscription-name='advanced-cluster-management' \ + autoshift.io/acm-channel='release-2.14' \ + autoshift.io/acm-source='redhat-operators' \ + autoshift.io/acm-source-namespace='openshift-marketplace' \ + --overwrite + +# Verify labels applied +oc get managedcluster local-cluster -o jsonpath='{.metadata.labels}' | jq 'to_entries[] | select(.key | startswith("autoshift"))' +``` + +#### Example: Enable RHOAI with Dependencies + +```bash +# RHOAI 3.0 with all dependencies +oc label managedcluster local-cluster \ + autoshift.io/serverless='true' \ + autoshift.io/serverless-subscription-name='serverless-operator' \ + autoshift.io/serverless-channel='stable' \ + autoshift.io/serverless-source='redhat-operators' \ + 
autoshift.io/serverless-source-namespace='openshift-marketplace' \ + autoshift.io/servicemesh3='true' \ + autoshift.io/servicemesh3-subscription-name='servicemeshoperator3' \ + autoshift.io/servicemesh3-channel='stable-3.2' \ + autoshift.io/servicemesh3-source='redhat-operators' \ + autoshift.io/servicemesh3-source-namespace='openshift-marketplace' \ + autoshift.io/pipelines='true' \ + autoshift.io/pipelines-subscription-name='openshift-pipelines-operator-rh' \ + autoshift.io/pipelines-channel='pipelines-1.20' \ + autoshift.io/pipelines-source='redhat-operators' \ + autoshift.io/pipelines-source-namespace='openshift-marketplace' \ + autoshift.io/node-feature-discovery='true' \ + autoshift.io/node-feature-discovery-subscription-name='nfd' \ + autoshift.io/node-feature-discovery-channel='stable' \ + autoshift.io/node-feature-discovery-source='redhat-operators' \ + autoshift.io/node-feature-discovery-source-namespace='openshift-marketplace' \ + autoshift.io/rhoai='true' \ + autoshift.io/rhoai-subscription-name='rhods-operator' \ + autoshift.io/rhoai-channel='fast-3.x' \ + autoshift.io/rhoai-source='redhat-operators' \ + autoshift.io/rhoai-source-namespace='openshift-marketplace' \ + --overwrite +``` + +#### Example: Enable NVIDIA GPU Operator + +```bash +# NVIDIA GPU Operator (requires x86 instance with GPU) +oc label managedcluster local-cluster \ + autoshift.io/nvidia-gpu='true' \ + autoshift.io/nvidia-gpu-subscription-name='gpu-operator-certified' \ + autoshift.io/nvidia-gpu-channel='stable' \ + autoshift.io/nvidia-gpu-source='certified-operators' \ + autoshift.io/nvidia-gpu-source-namespace='openshift-marketplace' \ + --overwrite +``` + +### Step 7: Verify Deployment ```bash # Check AutoShift Application diff --git a/docs/quickstart-from-source.md b/docs/quickstart-from-source.md new file mode 100644 index 00000000..d60f9c4a --- /dev/null +++ b/docs/quickstart-from-source.md @@ -0,0 +1,318 @@ +# AutoShift Quick Start (From Source) + +Deploy AutoShift from the Git 
repository for testing and development. + +## Prerequisites + +- OpenShift 4.18+ cluster +- `oc` CLI installed and logged in +- `helm` CLI installed + +```bash +# Verify you're logged in +oc whoami +``` + +--- + +## Step 1: Install OpenShift GitOps + +```bash +helm upgrade --install openshift-gitops openshift-gitops +``` + +Wait for GitOps pods to be ready (~2 minutes): + +```bash +oc get pods -n openshift-gitops -w +# Ctrl+C when you see pods in Running state +``` + +--- + +## Step 2: Install Advanced Cluster Management + +```bash +helm upgrade --install advanced-cluster-management advanced-cluster-management +``` + +Wait for ACM to finish installing (~10 minutes): + +```bash +oc get mch -A -w +# Ctrl+C when STATUS shows "Running" +``` + +--- + +## Step 3: Deploy AutoShift + +Once ACM is running, deploy AutoShift: + +```bash +cat << 'EOF' | oc apply -f - +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: autoshift + namespace: openshift-gitops +spec: + destination: + server: https://kubernetes.default.svc + source: + path: autoshift + repoURL: https://github.com/auto-shift/autoshiftv2.git + targetRevision: main + helm: + valueFiles: + - values.minimal.yaml + values: | + hubClusterSets: + hub: + labels: + self-managed: 'true' + openshift-version: '4.20.0' + gitops: 'true' + gitops-subscription-name: openshift-gitops-operator + gitops-channel: gitops-1.18 + gitops-source: redhat-operators + gitops-source-namespace: openshift-marketplace + acm-subscription-name: advanced-cluster-management + acm-channel: release-2.14 + acm-source: redhat-operators + acm-source-namespace: openshift-marketplace + project: default + syncPolicy: + automated: + prune: false + selfHeal: true +EOF +``` + +> **Note:** Update `openshift-version` to match your cluster version (e.g., `4.18.28`, `4.20.0`). 
+ +--- + +## Step 4: Add Hub Cluster to ClusterSet + +```bash +oc label managedcluster local-cluster cluster.open-cluster-management.io/clusterset=hub --overwrite +``` + +--- + +## Step 5: Verify Installation + +```bash +# Watch ArgoCD applications spin up +oc get applications -n openshift-gitops -w + +# Check policies are being created +oc get policies -A + +# Check policy compliance +oc get policies -n policies-autoshift +``` + +--- + +## Adding Operators + +### Option 1: Patch the Application + +Add operators by patching the AutoShift application with updated labels: + +```bash +oc patch applications.argoproj.io autoshift -n openshift-gitops --type=merge -p ' +spec: + source: + helm: + values: | + hubClusterSets: + hub: + labels: + self-managed: "true" + openshift-version: "4.20.0" + gitops: "true" + gitops-subscription-name: openshift-gitops-operator + gitops-channel: gitops-1.18 + gitops-source: redhat-operators + gitops-source-namespace: openshift-marketplace + acm-subscription-name: advanced-cluster-management + acm-channel: release-2.14 + acm-source: redhat-operators + acm-source-namespace: openshift-marketplace + ### Add OpenShift Pipelines + pipelines: "true" + pipelines-subscription-name: openshift-pipelines-operator-rh + pipelines-channel: pipelines-1.20 + pipelines-source: redhat-operators + pipelines-source-namespace: openshift-marketplace +' +``` + +### Option 2: Use a Custom Values File + +1. Create your own values file (e.g., `values.my-cluster.yaml`) +2. 
Update the AutoShift application to use it: + +```bash +oc patch application autoshift -n openshift-gitops --type=merge -p ' +spec: + source: + helm: + valueFiles: + - values.my-cluster.yaml +' +``` + +--- + +## Available Operators + +Common operators you can enable: + +| Operator | Label | Subscription Name | +|----------|-------|-------------------| +| OpenShift Pipelines | `pipelines: 'true'` | `openshift-pipelines-operator-rh` | +| Advanced Cluster Security | `acs: 'true'` | `rhacs-operator` | +| Compliance Operator | `compliance: 'true'` | `compliance-operator` | +| OpenShift Logging | `logging: 'true'` | `cluster-logging` | +| Loki | `loki: 'true'` | `loki-operator` | +| Cluster Observability | `coo: 'true'` | `cluster-observability-operator` | +| OpenShift Data Foundation | `odf: 'true'` | `odf-operator` | +| Developer Spaces | `dev-spaces: 'true'` | `devspaces` | +| Developer Hub | `dev-hub: 'true'` | `rhdh` | +| Quay | `quay: 'true'` | `quay-operator` | +| OpenShift Virtualization | `virt: 'true'` | `kubevirt-hyperconverged` | + +For each operator, you need these labels: + +```yaml +<operator>: 'true' +<operator>-subscription-name: <subscription-name> +<operator>-channel: <channel> +<operator>-source: redhat-operators +<operator>-source-namespace: openshift-marketplace +``` + +--- + +## Example: Enable Multiple Operators + +```bash +oc patch application autoshift -n openshift-gitops --type=merge -p ' +spec: + source: + helm: + values: | + hubClusterSets: + hub: + labels: + self-managed: "true" + openshift-version: "4.20.0" + ### Required: GitOps + gitops: "true" + gitops-subscription-name: openshift-gitops-operator + gitops-channel: gitops-1.18 + gitops-source: redhat-operators + gitops-source-namespace: openshift-marketplace + ### Required: ACM + acm-subscription-name: advanced-cluster-management + acm-channel: release-2.14 + acm-source: redhat-operators + acm-source-namespace: openshift-marketplace + ### OpenShift Pipelines + pipelines: "true" + pipelines-subscription-name: openshift-pipelines-operator-rh + pipelines-channel:
pipelines-1.20 + pipelines-source: redhat-operators + pipelines-source-namespace: openshift-marketplace + ### Advanced Cluster Security + acs: "true" + acs-subscription-name: rhacs-operator + acs-channel: stable + acs-source: redhat-operators + acs-source-namespace: openshift-marketplace + ### Compliance Operator + compliance: "true" + compliance-subscription-name: compliance-operator + compliance-channel: stable + compliance-source: redhat-operators + compliance-source-namespace: openshift-marketplace +' +``` + +--- + +## Verify Operator Installation + +```bash +# Check policies for your operator +oc get policies -A | grep pipelines + +# Watch the operator install +oc get csv -n openshift-operators -w + +# Check operator pods +oc get pods -n openshift-operators +``` + +--- + +## Troubleshooting + +### Check ArgoCD Application Status + +```bash +oc get application autoshift -n openshift-gitops -o yaml +oc describe application autoshift -n openshift-gitops +``` + +### Check Policy Status + +```bash +# List all policies +oc get policies -A + +# Describe a specific policy +oc describe policy policy-pipelines-operator-install -n policies-autoshift +``` + +### Check Cluster Labels + +```bash +oc get managedcluster local-cluster -o yaml | grep -A50 labels +``` + +### View ACM Policy Propagator Logs + +```bash +oc logs -n open-cluster-management deployment/grc-policy-propagator --tail=100 +``` + +--- + +## Cleanup + +To remove AutoShift: + +```bash +# Delete the AutoShift application +oc delete application autoshift -n openshift-gitops + +# Delete policies namespace +oc delete namespace policies-autoshift + +# Remove cluster from clusterset +oc label managedcluster local-cluster cluster.open-cluster-management.io/clusterset- +``` + +--- + +## Next Steps + +- [Adding New Operators](adding-new-operators.md) - Create custom operator policies +- [Developer Guide](developer-guide.md) - Full development documentation +- [OCI Quick Start](quickstart-oci.md) - Deploy from OCI 
registry (production) + diff --git a/docs/quickstart-oci.md b/docs/quickstart-oci.md index 7bc49433..0cecfbe8 100644 --- a/docs/quickstart-oci.md +++ b/docs/quickstart-oci.md @@ -101,15 +101,64 @@ hubClusterSets: hub: labels: self-managed: 'true' - openshift-version: '4.18.28' - # GitOps is required for hub clusters + openshift-version: '4.20.0' + + ### Required: GitOps gitops: 'true' - # ACM is automatically installed on all hub clustersets by policy - # Optional: Additional operators - acs: 'true' - acs-channel: 'stable' - odf: 'true' - odf-channel: 'stable-4.18' + gitops-subscription-name: openshift-gitops-operator + gitops-channel: gitops-1.18 + gitops-source: redhat-operators + gitops-source-namespace: openshift-marketplace + + ### Required: ACM (automatically installed on hub clustersets) + acm-subscription-name: advanced-cluster-management + acm-channel: release-2.14 + acm-source: redhat-operators + acm-source-namespace: openshift-marketplace + + ### ===== RHOAI 3.0 AND DEPENDENCIES ===== + + ### OpenShift Serverless (required for KServe model serving) + serverless: 'true' + serverless-subscription-name: serverless-operator + serverless-channel: stable + serverless-source: redhat-operators + serverless-source-namespace: openshift-marketplace + + ### OpenShift Service Mesh 3 (required for model serving) + servicemesh3: 'true' + servicemesh3-subscription-name: servicemeshoperator3 + servicemesh3-channel: stable-3.2 + servicemesh3-source: redhat-operators + servicemesh3-source-namespace: openshift-marketplace + + ### OpenShift Pipelines (for ML pipelines) + pipelines: 'true' + pipelines-subscription-name: openshift-pipelines-operator-rh + pipelines-channel: pipelines-1.20 + pipelines-source: redhat-operators + pipelines-source-namespace: openshift-marketplace + + ### Node Feature Discovery (for GPU detection) + node-feature-discovery: 'true' + node-feature-discovery-subscription-name: nfd + node-feature-discovery-channel: stable + node-feature-discovery-source: 
redhat-operators + node-feature-discovery-source-namespace: openshift-marketplace + + ### NVIDIA GPU Operator + nvidia-gpu: 'true' + nvidia-gpu-subscription-name: gpu-operator-certified + nvidia-gpu-channel: stable + nvidia-gpu-source: certified-operators + nvidia-gpu-source-namespace: openshift-marketplace + + ### Red Hat OpenShift AI 3.0 + rhoai: 'true' + rhoai-subscription-name: rhods-operator + rhoai-channel: fast-3.x + rhoai-source: redhat-operators + rhoai-source-namespace: openshift-marketplace EOF # 3. Install diff --git a/policies/kiali/templates/policy-kiali-operator-install.yaml b/policies/kiali/templates/policy-kiali-operator-install.yaml index 7385ad77..bcbf3349 100644 --- a/policies/kiali/templates/policy-kiali-operator-install.yaml +++ b/policies/kiali/templates/policy-kiali-operator-install.yaml @@ -38,7 +38,8 @@ spec: severity: high complianceType: musthave operatorGroup: - name: {{ .Values.kiali.operatorGroupName }} + # if operator group exists in namespace use that + name: '{{ "{{" }} with (lookup "operators.coreos.com/v1" "OperatorGroup" "{{ .Values.kiali.namespace }}" "").items {{ "}}" }}{{ "{{" }} (index . 
0).metadata.name {{ "}}" }}{{ "{{" }} else {{ "}}" }}{{ .Values.kiali.operatorGroupName }}{{ "{{" }} end {{ "}}" }}' namespace: {{ .Values.kiali.namespace }} {{- if .Values.kiali.targetNamespaces }} targetNamespaces: @@ -47,12 +48,18 @@ spec: {{- end }} {{- end }} subscription: - startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-version" | default "" {{ "hub}}" }}' - namespace: {{ .Values.kiali.namespace }} - channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-channel" | default "{{ .Values.kiali.channel }}" {{ "hub}}" }}' name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-subscription-name" | default "{{ .Values.kiali.name }}" {{ "hub}}" }}' - source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-source" | default "{{ .Values.kiali.source }}" {{ "hub}}" }}' + namespace: {{ .Values.kiali.namespace }} + # Source Selection Logic (evaluated on hub for each managed cluster): + # 1. If cluster has explicit 'autoshift.io/kiali-source' label → use that value + # 2. Else if cluster has 'autoshift.io/disconnected-mirror: true': + # → use {source}-{suffix}, joined with a hyphen (e.g., redhat-operators-mirror) + # → suffix from 'autoshift.io/mirror-catalog-suffix' label (no leading hyphen), or default 'mirror' + # 3.
Else → use standard source (e.g., redhat-operators) + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-source" | default (ternary (printf "%s-%s" "{{ .Values.kiali.source }}" (index .ManagedClusterLabels "autoshift.io/mirror-catalog-suffix" | default "mirror")) "{{ .Values.kiali.source }}" (eq (index .ManagedClusterLabels "autoshift.io/disconnected-mirror") "true")) {{ "hub}}" }}' sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-source-namespace" | default "{{ .Values.kiali.sourceNamespace }}" {{ "hub}}" }}' + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-channel" | default "{{ .Values.kiali.channel }}" {{ "hub}}" }}' + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/kiali-version" | default "" {{ "hub}}" }}' # Upgrade approval for subscription will be set based on if version is set not this flag # If version is set install plan is set to manual and only the version specified will be installed upgradeApproval: Automatic diff --git a/policies/node-feature-discovery/templates/policy-node-feature-discovery-instance.yaml b/policies/node-feature-discovery/templates/policy-node-feature-discovery-instance.yaml index 6529204f..9f0bc706 100644 --- a/policies/node-feature-discovery/templates/policy-node-feature-discovery-instance.yaml +++ b/policies/node-feature-discovery/templates/policy-node-feature-discovery-instance.yaml @@ -39,76 +39,11 @@ spec: enableTaints: false instance: '' operand: - image: {{ .Values.nodeFeatureDiscovery.operandImage }} - imagePullPolicy: Always servicePort: 0 prunerOnDelete: false topologyUpdater: false workerConfig: - configData: | - core: - # labelWhiteList: - # noPublish: false - sleepInterval: 60s - # sources: [all] - # klog: - # addDirHeader: false - # alsologtostderr: false - # logBacktraceAt: - # logtostderr: true - # skipHeaders: false - # stderrthreshold: 2 - # v: 0 - # vmodule: - ## NOTE: the following options are not dynamically 
run-time configurable - ## and require a nfd-worker restart to take effect after being changed - # logDir: - # logFile: - # logFileMaxSize: 1800 - # skipLogHeaders: false - sources: - cpu: - cpuid: - # NOTE: whitelist has priority over blacklist - attributeBlacklist: - - "BMI1" - - "BMI2" - - "CLMUL" - - "CMOV" - - "CX16" - - "ERMS" - - "F16C" - - "HTT" - - "LZCNT" - - "MMX" - - "MMXEXT" - - "NX" - - "POPCNT" - - "RDRAND" - - "RDSEED" - - "RDTSCP" - - "SGX" - - "SSE" - - "SSE2" - - "SSE3" - - "SSE4.1" - - "SSE4.2" - - "SSSE3" - attributeWhitelist: - kernel: - kconfigFile: "/path/to/kconfig" - configOpts: - - "NO_HZ" - - "X86" - - "DMI" - pci: - deviceClassWhitelist: - - "0200" - - "03" - - "0302" - - "12" - deviceLabelFields: - - "class" + configData: '' --- apiVersion: cluster.open-cluster-management.io/v1beta1 kind: Placement diff --git a/policies/node-feature-discovery/templates/policy-node-feature-discovery-operator-install.yaml b/policies/node-feature-discovery/templates/policy-node-feature-discovery-operator-install.yaml index 3c6980c0..a1f975d5 100644 --- a/policies/node-feature-discovery/templates/policy-node-feature-discovery-operator-install.yaml +++ b/policies/node-feature-discovery/templates/policy-node-feature-discovery-operator-install.yaml @@ -38,7 +38,8 @@ spec: severity: high complianceType: musthave operatorGroup: - name: {{ .Values.nodeFeatureDiscovery.operatorGroupName }} + # if operator group exists in namespace use that + name: '{{ "{{" }} with (lookup "operators.coreos.com/v1" "OperatorGroup" "{{ .Values.nodeFeatureDiscovery.namespace }}" "").items {{ "}}" }}{{ "{{" }} (index . 
0).metadata.name {{ "}}" }}{{ "{{" }} else {{ "}}" }}{{ .Values.nodeFeatureDiscovery.operatorGroupName }}{{ "{{" }} end {{ "}}" }}' namespace: {{ .Values.nodeFeatureDiscovery.namespace }} {{- if .Values.nodeFeatureDiscovery.targetNamespaces }} targetNamespaces: @@ -47,12 +48,18 @@ spec: {{- end }} {{- end }} subscription: - startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-version" | default "" {{ "hub}}" }}' - namespace: {{ .Values.nodeFeatureDiscovery.namespace }} - channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-channel" | default "{{ .Values.nodeFeatureDiscovery.channel }}" {{ "hub}}" }}' name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-subscription-name" | default "{{ .Values.nodeFeatureDiscovery.name }}" {{ "hub}}" }}' - source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-source" | default "{{ .Values.nodeFeatureDiscovery.source }}" {{ "hub}}" }}' + namespace: {{ .Values.nodeFeatureDiscovery.namespace }} + # Source Selection Logic (evaluated on hub for each managed cluster): + # 1. If cluster has explicit 'autoshift.io/node-feature-discovery-source' label → use that value + # 2. Else if cluster has 'autoshift.io/disconnected-mirror: true': + # → use {source}-{suffix}, joined with a hyphen (e.g., redhat-operators-mirror) + # → suffix from 'autoshift.io/mirror-catalog-suffix' label (no leading hyphen), or default 'mirror' + # 3.
Else → use standard source (e.g., redhat-operators) + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-source" | default (ternary (printf "%s-%s" "{{ .Values.nodeFeatureDiscovery.source }}" (index .ManagedClusterLabels "autoshift.io/mirror-catalog-suffix" | default "mirror")) "{{ .Values.nodeFeatureDiscovery.source }}" (eq (index .ManagedClusterLabels "autoshift.io/disconnected-mirror") "true")) {{ "hub}}" }}' sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-source-namespace" | default "{{ .Values.nodeFeatureDiscovery.sourceNamespace }}" {{ "hub}}" }}' + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-channel" | default "{{ .Values.nodeFeatureDiscovery.channel }}" {{ "hub}}" }}' + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/node-feature-discovery-version" | default "" {{ "hub}}" }}' # Upgrade approval for subscription will be set based on if version is set not this flag # If version is set install plan is set to manual and only the version specified will be installed upgradeApproval: Automatic diff --git a/policies/node-feature-discovery/values.yaml b/policies/node-feature-discovery/values.yaml index ca596b04..cf616b23 100644 --- a/policies/node-feature-discovery/values.yaml +++ b/policies/node-feature-discovery/values.yaml @@ -11,7 +11,6 @@ nodeFeatureDiscovery: operatorGroupName: openshift-nfd-operator targetNamespaces: # Target namespaces for namespace-scoped operators - openshift-nfd - operandImage: 'registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.18' # hubClusterSets: # hub: diff --git a/policies/nvidia-gpu/Chart.yaml b/policies/nvidia-gpu/Chart.yaml new file mode 100644 index 00000000..3f1d7683 --- /dev/null +++ b/policies/nvidia-gpu/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: nvidia-gpu +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 
'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" \ No newline at end of file diff --git a/policies/nvidia-gpu/README.md b/policies/nvidia-gpu/README.md new file mode 100644 index 00000000..b1142d4b --- /dev/null +++ b/policies/nvidia-gpu/README.md @@ -0,0 +1,258 @@ +# NVIDIA GPU Operator Policy + +This policy installs and configures the NVIDIA GPU Operator on OpenShift clusters with NVIDIA GPUs. 
+ +## What Gets Installed + +| Policy | Description | +|--------|-------------| +| `policy-nvidia-gpu-operator-install` | Installs the gpu-operator-certified operator | +| `policy-nvidia-gpu-config` | Creates ClusterPolicy to configure GPU stack | + +## Prerequisites + +- Nodes with NVIDIA GPUs (e.g., AWS g4dn, g5, g6e instances) +- **Node Feature Discovery (NFD) operator installed (REQUIRED)** - Creates the `pci-10de.present=true` label that triggers GPU detection +- Entitled RHEL nodes (for driver compilation) OR pre-built driver containers + +> **Important**: `policy-nvidia-gpu-config` only declares a dependency on `policy-nvidia-gpu-operator-install`; it does not wait for any NFD policy. Because NFD is required, you must enable NFD yourself when using the NVIDIA GPU operator. + +## Quick Start + +Enable NVIDIA GPU operator: + +```yaml +hubClusterSets: + hub: + labels: + # Required: enable NFD as well (see Prerequisites) + node-feature-discovery: 'true' + node-feature-discovery-subscription-name: nfd + node-feature-discovery-channel: stable + node-feature-discovery-source: redhat-operators + node-feature-discovery-source-namespace: openshift-marketplace + + # NVIDIA GPU Operator + nvidia-gpu: 'true' + nvidia-gpu-subscription-name: gpu-operator-certified + nvidia-gpu-channel: stable + nvidia-gpu-source: certified-operators + nvidia-gpu-source-namespace: openshift-marketplace +``` + +## Configuration Labels + +### Operator Configuration + +| Label | Default | Description | +|-------|---------|-------------| +| `nvidia-gpu` | - | Enable NVIDIA GPU operator (`'true'` or `'false'`) | +| `nvidia-gpu-subscription-name` | `gpu-operator-certified` | OLM package name | +| `nvidia-gpu-channel` | `stable` | Operator channel | +| `nvidia-gpu-version` | - | Pin to specific CSV version (optional) | +| `nvidia-gpu-source` | `certified-operators` | Catalog source | +| `nvidia-gpu-source-namespace` | `openshift-marketplace` | Catalog namespace | + +### ClusterPolicy Component Configuration + +| Label | Default | Description | +|-------|---------|-------------| +|
`nvidia-gpu-driver` | `true` | Install GPU driver | +| `nvidia-gpu-driver-toolkit` | `true` | Use OpenShift Driver Toolkit | +| `nvidia-gpu-toolkit` | `true` | Install CUDA toolkit | +| `nvidia-gpu-device-plugin` | `true` | Enable Kubernetes device plugin | +| `nvidia-gpu-dcgm` | `true` | Enable DCGM exporter for metrics | +| `nvidia-gpu-gfd` | `true` | Enable GPU Feature Discovery | +| `nvidia-gpu-mig` | `false` | Enable MIG manager (A30/A100/H100 only) | +| `nvidia-gpu-sandbox` | `false` | Enable sandbox workloads | +| `nvidia-gpu-vgpu` | `false` | Enable vGPU support (requires license) | + +## Examples + +### Basic GPU Setup + +```yaml +nvidia-gpu: 'true' +nvidia-gpu-subscription-name: gpu-operator-certified +nvidia-gpu-channel: stable +nvidia-gpu-source: certified-operators +nvidia-gpu-source-namespace: openshift-marketplace +``` + +### GPU with NFD Operator (Required) + +```yaml +# NFD is REQUIRED for GPU detection +node-feature-discovery: 'true' +node-feature-discovery-subscription-name: nfd +node-feature-discovery-channel: stable +node-feature-discovery-source: redhat-operators +node-feature-discovery-source-namespace: openshift-marketplace + +# GPU Operator (uses NFD for hardware detection) +nvidia-gpu: 'true' +nvidia-gpu-subscription-name: gpu-operator-certified +nvidia-gpu-channel: stable +nvidia-gpu-source: certified-operators +nvidia-gpu-source-namespace: openshift-marketplace +``` + +### GPU with MIG Enabled + +```yaml +nvidia-gpu: 'true' +nvidia-gpu-subscription-name: gpu-operator-certified +nvidia-gpu-channel: stable +nvidia-gpu-source: certified-operators +nvidia-gpu-source-namespace: openshift-marketplace +nvidia-gpu-mig: 'true' +``` + +### Minimal GPU (Driver + Device Plugin only) + +```yaml +nvidia-gpu: 'true' +nvidia-gpu-subscription-name: gpu-operator-certified +nvidia-gpu-channel: stable +nvidia-gpu-source: certified-operators +nvidia-gpu-source-namespace: openshift-marketplace +nvidia-gpu-dcgm: 'false' +nvidia-gpu-mig: 'false' 
+nvidia-gpu-gfd: 'false' +``` + +## Integration with RHOAI + +For RHOAI GPU workloads, enable both NFD and GPU operator: + +```yaml +# NFD for hardware detection +node-feature-discovery: 'true' +node-feature-discovery-subscription-name: nfd +node-feature-discovery-channel: stable +node-feature-discovery-source: redhat-operators +node-feature-discovery-source-namespace: openshift-marketplace + +# GPU Operator +nvidia-gpu: 'true' +nvidia-gpu-subscription-name: gpu-operator-certified +nvidia-gpu-channel: stable +nvidia-gpu-source: certified-operators +nvidia-gpu-source-namespace: openshift-marketplace + +# RHOAI +rhoai: 'true' +rhoai-subscription-name: rhods-operator +rhoai-channel: fast-3.x +rhoai-source: redhat-operators +rhoai-source-namespace: openshift-marketplace +``` + +## Verification + +Check GPU operator installation: + +```bash +# Check operator +oc get csv -n nvidia-gpu-operator | grep gpu + +# Check ClusterPolicy +oc get clusterpolicy + +# Check ClusterPolicy status +oc describe clusterpolicy gpu-cluster-policy + +# Check GPU operator pods +oc get pods -n nvidia-gpu-operator + +# Check GPU nodes are labeled +oc get nodes -l nvidia.com/gpu.present=true + +# Verify GPUs are detected +oc get nodes -o json | jq '.items[].status.allocatable | select(.["nvidia.com/gpu"] != null)' +``` + +## Troubleshooting + +### Driver not installing + +```bash +# Check driver pod logs +oc logs -n nvidia-gpu-operator -l app=nvidia-driver-daemonset + +# Check if node has GPU hardware +oc describe node <node-name> | grep -i nvidia + +# Verify NFD detected the GPU +oc get node <node-name> -o json | jq '.metadata.labels | with_entries(select(.key | startswith("feature.node.kubernetes.io/pci-10de")))' +``` + +### ClusterPolicy not ready + +```bash +# Check ClusterPolicy status +oc describe clusterpolicy gpu-cluster-policy + +# Check all GPU pods +oc get pods -n nvidia-gpu-operator + +# Check events +oc get events -n nvidia-gpu-operator --sort-by='.lastTimestamp' +``` + +### Pods can't request GPUs +
+```bash +# Verify device plugin is running +oc get pods -n nvidia-gpu-operator -l app=nvidia-device-plugin-daemonset + +# Check node allocatable resources +oc get nodes -o custom-columns="NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" +``` + +## values.yaml Reference + +```yaml +nvidiaGpu: + # Operator settings + name: gpu-operator-certified + namespace: nvidia-gpu-operator + channel: stable + source: certified-operators + sourceNamespace: openshift-marketplace + operatorGroupName: nvidia-gpu-operator-group + + # ClusterPolicy settings + clusterPolicy: + name: gpu-cluster-policy + driver: + enabled: true + useOpenShiftDriverToolkit: true + toolkit: + enabled: true + devicePlugin: + enabled: true + dcgmExporter: + enabled: true + migManager: + enabled: false # Enable only for MIG-capable GPUs (A30/A100/H100) + gfd: + enabled: true + sandboxWorkloads: + enabled: false + vgpuManager: + enabled: false + vgpuDeviceManager: + enabled: false +``` + +## Supported GPUs + +| GPU | AWS Instance | MIG Support | Notes | +|-----|--------------|-------------|-------| +| NVIDIA T4 | g4dn.* | No | Good for inference | +| NVIDIA L4 | g6.* | No | Newer, efficient | +| NVIDIA A10G | g5.* | No | Training + inference | +| NVIDIA L40S | g6e.* | No | High performance | +| NVIDIA A100 | p4d.* | Yes | Enable `nvidia-gpu-mig: 'true'` | +| NVIDIA H100 | p5.* | Yes | Enable `nvidia-gpu-mig: 'true'` | diff --git a/policies/nvidia-gpu/templates/policy-nvidia-gpu-config.yaml b/policies/nvidia-gpu/templates/policy-nvidia-gpu-config.yaml new file mode 100644 index 00000000..3c1fceec --- /dev/null +++ b/policies/nvidia-gpu/templates/policy-nvidia-gpu-config.yaml @@ -0,0 +1,163 @@ +{{- $policyName := "policy-nvidia-gpu-config" }} +{{- $placementName := "placement-policy-nvidia-gpu-config" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + 
policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + # Wait for NVIDIA GPU operator to be installed first + # NOTE(review): no NFD policy is listed as a dependency below, although the chart README lists NFD as a + # required prerequisite (it creates the pci-10de.present label) - confirm whether one should be added + dependencies: + - name: policy-nvidia-gpu-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + # ClusterPolicy - configures the GPU operator + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: nvidia-gpu-cluster-policy + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: nvidia.com/v1 + kind: ClusterPolicy + metadata: + name: {{ .Values.nvidiaGpu.clusterPolicy.name }} + spec: + operator: + defaultRuntime: crio + # Note: ClusterPolicy CRD requires native booleans, not strings + # Hub templating cannot be used for these boolean fields + use_ocp_driver_toolkit: {{ .Values.nvidiaGpu.clusterPolicy.driver.useOpenShiftDriverToolkit }} + daemonsets: {} + driver: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.driver.enabled }} + upgradePolicy: + autoUpgrade: true + maxParallelUpgrades: 1 + maxUnavailable: "25%" + podDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + waitForCompletion: + timeoutSeconds: 0 + repoConfig: + configMapName: "" + certConfig: + name: "" + licensingConfig: + nlsEnabled: false + configMapName: "" + virtualTopology: + config: "" + kernelModuleConfig: + name: "" + toolkit: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.toolkit.enabled }} + devicePlugin: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.devicePlugin.enabled }} + dcgm: +
enabled: {{ .Values.nvidiaGpu.clusterPolicy.dcgmExporter.enabled }} + dcgmExporter: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.dcgmExporter.enabled }} + config: + name: "" + serviceMonitor: + enabled: true + gfd: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.gfd.enabled }} + migManager: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.migManager.enabled }} + nodeStatusExporter: + enabled: true + sandboxWorkloads: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.sandboxWorkloads.enabled }} + validator: + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + vgpuManager: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.vgpuManager.enabled }} + vgpuDeviceManager: + enabled: {{ .Values.nvidiaGpu.clusterPolicy.vgpuDeviceManager.enabled }} + vfioManager: + enabled: true + sandboxDevicePlugin: + enabled: true + # Status check - verify ClusterPolicy is ready + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: nvidia-gpu-cluster-policy-status + spec: + remediationAction: inform + severity: medium + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: nvidia.com/v1 + kind: ClusterPolicy + metadata: + name: {{ .Values.nvidiaGpu.clusterPolicy.name }} + status: + state: ready +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/nvidia-gpu' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: 
policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy + diff --git a/policies/nvidia-gpu/templates/policy-nvidia-gpu-operator-install.yaml b/policies/nvidia-gpu/templates/policy-nvidia-gpu-operator-install.yaml new file mode 100644 index 00000000..96aae8b6 --- /dev/null +++ b/policies/nvidia-gpu/templates/policy-nvidia-gpu-operator-install.yaml @@ -0,0 +1,100 @@ +{{- $policyName := "policy-nvidia-gpu-operator-install" }} +{{- $placementName := "placement-policy-nvidia-gpu-operator-install" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: nvidia-gpu-operator-ns + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: {{ .Values.nvidiaGpu.namespace }} + labels: + openshift.io/cluster-monitoring: "true" + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1beta1 + kind: OperatorPolicy + metadata: + name: install-nvidia-gpu + spec: + remediationAction: enforce + severity: high + complianceType: musthave + operatorGroup: + name: {{ .Values.nvidiaGpu.operatorGroupName }} + namespace: {{ .Values.nvidiaGpu.namespace }} + # NVIDIA GPU operator requires 
OwnNamespace install mode + targetNamespaces: + - {{ .Values.nvidiaGpu.namespace }} + subscription: + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-version" | default "" {{ "hub}}" }}' + namespace: {{ .Values.nvidiaGpu.namespace }} + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-channel" | default "{{ .Values.nvidiaGpu.channel }}" {{ "hub}}" }}' + name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-subscription-name" | default "{{ .Values.nvidiaGpu.name }}" {{ "hub}}" }}' + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-source" | default "{{ .Values.nvidiaGpu.source }}" {{ "hub}}" }}' + sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-source-namespace" | default "{{ .Values.nvidiaGpu.sourceNamespace }}" {{ "hub}}" }}' + # Upgrade approval for subscription will be set based on if version is set not this flag + # If version is set install plan is set to manual and only the version specified will be installed + upgradeApproval: Automatic + versions: + - '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/nvidia-gpu-version" | default "" {{ "hub}}" }}' +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/nvidia-gpu' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding 
+metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy \ No newline at end of file diff --git a/policies/nvidia-gpu/values.yaml b/policies/nvidia-gpu/values.yaml new file mode 100644 index 00000000..79955383 --- /dev/null +++ b/policies/nvidia-gpu/values.yaml @@ -0,0 +1,100 @@ +# Default values for nvidia-gpu +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +policy_namespace: open-cluster-policies + +nvidiaGpu: + # Operator settings + name: gpu-operator-certified + namespace: nvidia-gpu-operator + channel: stable + source: certified-operators + sourceNamespace: openshift-marketplace + operatorGroupName: nvidia-gpu-operator-group + + # ClusterPolicy settings + clusterPolicy: + name: gpu-cluster-policy + + # Driver configuration + driver: + enabled: true + # Use pre-installed driver on the host (for OpenShift with entitled nodes) + usePrecompiled: false + # For OpenShift, typically use the OpenShift driver toolkit + useOpenShiftDriverToolkit: true + # Upgrade policy: OnDelete, Immediate + upgradePolicy: OnDelete + + # CUDA Toolkit configuration + toolkit: + enabled: true + + # Device Plugin configuration + devicePlugin: + enabled: true + + # DCGM (Data Center GPU Manager) Exporter for metrics + dcgmExporter: + enabled: true + + # MIG (Multi-Instance GPU) Manager + migManager: + enabled: false # Enable only if using MIG-capable GPUs (A30, A100, H100) + + # GPU Feature Discovery + gfd: + enabled: true + + # Sandbox workloads (for vGPU) + sandboxWorkloads: + enabled: false + + # vGPU Manager (requires vGPU license) + vgpuManager: + enabled: false + + # vGPU Device Manager + vgpuDeviceManager: + enabled: false + +# hubClusterSets: +# hub: +# labels: +# nvidia-gpu: 'true' +# managedClusterSets: +# 
managed: +# labels: +# nvidia-gpu: 'true' + +### AutoShift Labels Documentation +# The following labels can be set at the cluster or clusterset level to configure this policy: +# +# Enable/Disable: +# nvidia-gpu: 'true' or 'false' - Controls whether NVIDIA GPU operator is managed +# +# Operator Configuration: +# nvidia-gpu-subscription-name: Operator subscription name (default: 'gpu-operator-certified') +# nvidia-gpu-channel: Operator channel (default: 'stable') +# nvidia-gpu-version: Specific operator version (CSV) to install (optional) +# nvidia-gpu-source: Operator catalog source (default: 'certified-operators') +# nvidia-gpu-source-namespace: Catalog namespace (default: 'openshift-marketplace') +# +# ClusterPolicy Configuration: +# nvidia-gpu-driver: Enable GPU driver installation (default: 'true') +# nvidia-gpu-driver-toolkit: Use OpenShift Driver Toolkit (default: 'true') +# nvidia-gpu-toolkit: Enable CUDA toolkit (default: 'true') +# nvidia-gpu-device-plugin: Enable device plugin (default: 'true') +# nvidia-gpu-dcgm: Enable DCGM exporter for metrics (default: 'true') +# nvidia-gpu-mig: Enable MIG manager (default: 'false' - for A30/A100/H100 only) +# nvidia-gpu-gfd: Enable GPU Feature Discovery (default: 'true') +# +# Note: NFD (Node Feature Discovery) is OPTIONAL. The NVIDIA GPU Operator +# includes its own GPU Feature Discovery (GFD) component that detects GPUs +# independently. GFD is enabled by default (nvidia-gpu-gfd: 'true'). +# +# Examples: +# autoshift.io/nvidia-gpu: 'true' +# autoshift.io/nvidia-gpu-channel: 'stable' +# autoshift.io/nvidia-gpu-driver: 'true' +# autoshift.io/nvidia-gpu-dcgm: 'true' diff --git a/policies/rhoai/Chart.yaml b/policies/rhoai/Chart.yaml new file mode 100644 index 00000000..55cbdcdd --- /dev/null +++ b/policies/rhoai/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: rhoai +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. 
+# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" \ No newline at end of file diff --git a/policies/rhoai/README.md b/policies/rhoai/README.md new file mode 100644 index 00000000..1a26917c --- /dev/null +++ b/policies/rhoai/README.md @@ -0,0 +1,302 @@ +# RHOAI (Red Hat OpenShift AI) Policy + +This policy installs and configures Red Hat OpenShift AI (RHOAI) 3.0+ on OpenShift clusters. 
+ +## What Gets Installed + +| Policy | Description | +|--------|-------------| +| `policy-rhoai-operator-install` | Installs the rhods-operator | +| `policy-rhoai-config` | Creates DSCInitialization and DataScienceCluster | + +### ConfigurationPolicies Created + +| ConfigurationPolicy | Description | +|---------------------|-------------| +| `rhoai-dsci` | Creates DSCInitialization with monitoring and service mesh config | +| `rhoai-dsc-bootstrap` | Creates DataScienceCluster with minimal spec | +| `rhoai-dsc` | Configures DSC components using v2 API (`managementState: Managed` for enabled components) | +| `rhoai-knative-serving` | Creates KnativeServing for KServe | +| `rhoai-dashboard-route` | Creates Route to expose the dashboard | + +> **RHOAI 3.0 v2 API Changes**: +> - Uses `datasciencecluster.opendatahub.io/v2` API +> - Enabled components are rendered with an explicit `managementState: Managed` +> - `kueue` is the exception: it is enabled by its presence (with default queue names) and gets no `managementState: Managed` +> - Component name changes: `datasciencepipelines` → `aipipelines` +> - Removed components: `codeflare`, `modelmeshserving` (deprecated) +> - `OdhDashboardConfig` CRD removed (dashboard config managed by Dashboard component) + +## Dependencies + +RHOAI requires these operators to be installed first: + +| Dependency | Policy | Required For | +|------------|--------|--------------| +| OpenShift Serverless | `policies/serverless/` | KServe model serving | +| OpenShift Service Mesh 3 | `policies/servicemesh3/` | Traffic routing, mTLS | +| OpenShift Pipelines | `policies/openshift-pipelines/` | Data Science Pipelines | +| Node Feature Discovery | `policies/node-feature-discovery/` | GPU detection (optional) | + +## Quick Start + +Enable RHOAI with all dependencies: + +```yaml +hubClusterSets: + hub: + labels: + # Dependencies + serverless: 'true' + serverless-subscription-name: serverless-operator + serverless-channel: stable + serverless-source: redhat-operators + serverless-source-namespace: openshift-marketplace + + servicemesh3: 'true' +
servicemesh3-subscription-name: servicemeshoperator3 + servicemesh3-channel: stable-3.2 + servicemesh3-source: redhat-operators + servicemesh3-source-namespace: openshift-marketplace + + pipelines: 'true' + pipelines-subscription-name: openshift-pipelines-operator-rh + pipelines-channel: pipelines-1.20 + pipelines-source: redhat-operators + pipelines-source-namespace: openshift-marketplace + + # RHOAI + rhoai: 'true' + rhoai-subscription-name: rhods-operator + rhoai-channel: fast-3.x + rhoai-source: redhat-operators + rhoai-source-namespace: openshift-marketplace +``` + +## Configuration Labels + +### Operator Configuration + +| Label | Default | Description | +|-------|---------|-------------| +| `rhoai` | - | Enable RHOAI (`'true'` or `'false'`) | +| `rhoai-subscription-name` | `rhods-operator` | OLM package name | +| `rhoai-channel` | `fast-3.x` | Operator channel | +| `rhoai-version` | - | Pin to specific CSV version (optional) | +| `rhoai-source` | `redhat-operators` | Catalog source | +| `rhoai-source-namespace` | `openshift-marketplace` | Catalog namespace | + +### DataScienceCluster Component Configuration (v2 API) + +Components are controlled via values.yaml (set to `true` to enable, `false` to disable): + +| Component | Default | Description | +|-----------|---------|-------------| +| `dashboard` | `true` | RHOAI Dashboard UI | +| `workbenches` | `true` | Jupyter notebooks and workbenches | +| `aipipelines` | `true` | AI Pipelines (formerly datasciencepipelines) | +| `kserve` | `true` | KServe model serving | +| `modelregistry` | `true` | Model Registry for model versioning | +| `ray` | `true` | Ray distributed computing | +| `kueue` | `true` | Kueue job queuing | +| `trainingoperator` | `true` | Training Operator (PyTorchJob, etc.) 
| +| `trustyai` | `true` | TrustyAI model explainability | + +**Removed in v2:** +- `codeflare` - functionality merged into other components +- `modelmeshserving` - use KServe instead +- `datasciencepipelines` - renamed to `aipipelines` + +### Infrastructure Configuration + +| Label | Default | Description | +|-------|---------|-------------| +| `rhoai-monitoring` | `Managed` | Monitoring stack | +| `rhoai-servicemesh` | `Managed` | Service Mesh integration | +| `rhoai-trustedca` | `Managed` | Trusted CA bundle management | + +## Examples + +### Minimal RHOAI (Dashboard + Workbenches only) + +Configure via `values.yaml`: + +```yaml +rhoai: + dsc: + dashboard: true + workbenches: true + # Disable other components + aipipelines: false + kserve: false + modelregistry: false + ray: false + kueue: false + trainingoperator: false + trustyai: false +``` + +### Full RHOAI with Model Serving (Default) + +All components enabled by default in `values.yaml`: + +```yaml +rhoai: + dsc: + dashboard: true + workbenches: true + aipipelines: true + kserve: true + modelregistry: true + ray: true + kueue: true + trainingoperator: true + trustyai: true +``` + +### Pin to Specific Version + +```yaml +rhoai: 'true' +rhoai-subscription-name: rhods-operator +rhoai-channel: fast-3.x +rhoai-version: 'rhods-operator.3.0.0' +rhoai-source: redhat-operators +rhoai-source-namespace: openshift-marketplace +``` + +## Verification + +Check RHOAI installation status: + +```bash +# Check operator +oc get csv -n redhat-ods-operator | grep rhods + +# Check DSCInitialization +oc get dscinitializations + +# Check DataScienceCluster +oc get datascienceclusters + +# Check RHOAI pods +oc get pods -n redhat-ods-applications + +# Check Dashboard route +oc get route -n redhat-ods-applications rhods-dashboard +``` + +## Troubleshooting + +### Policies not applying + +```bash +# Check policy status +oc get policies -n policies-autoshift | grep rhoai + +# Check operator policy on spoke +oc describe operatorpolicy 
install-rhoai -n <managed-cluster-namespace> + +# Check config policy +oc describe configurationpolicy rhoai-dsc -n <managed-cluster-namespace> +``` + +### DSCInitialization stuck + +```bash +# Check DSCI status +oc describe dscinitializations default-dsci + +# Check operator logs +oc logs -n redhat-ods-operator -l app.kubernetes.io/name=rhods-operator --tail=100 +``` + +### DataScienceCluster not ready + +```bash +# Check DSC status +oc describe datascienceclusters default-dsc + +# Check component status +oc get pods -n redhat-ods-applications +oc get pods -n redhat-ods-monitoring +``` + +### Components not deploying (RHOAI 3.0 v2) + +If components are not deploying: + +```bash +# Check DSC spec vs status +oc get datasciencecluster default-dsc -o yaml + +# Check if components are present in spec (v2 API) +oc get datasciencecluster default-dsc -o jsonpath='{.spec.components}' | jq + +# Check component status +oc get datasciencecluster default-dsc -o jsonpath='{.status.components}' | jq + +# Check ConfigurationPolicy compliance +oc get configurationpolicy -n local-cluster | grep rhoai + +# View policy details +oc describe configurationpolicy rhoai-dsc -n local-cluster + +# Check Dashboard component CRD (v2 uses components.platform.opendatahub.io) +oc get dashboard -A +``` + +## values.yaml Reference + +```yaml +rhoai: + # Operator settings + name: rhods-operator + namespace: redhat-ods-operator + channel: fast-3.x + source: redhat-operators + sourceNamespace: openshift-marketplace + operatorGroupName: rhoai-operator-group + + # DSCInitialization settings + dsci: + name: default-dsci + applicationsNamespace: redhat-ods-applications + monitoringNamespace: redhat-ods-monitoring + monitoringState: Managed + serviceMeshState: Managed + serviceMeshNamespace: istio-system + serviceMeshControlPlaneName: data-science-smcp + trustedCABundleState: Managed + + # DataScienceCluster settings (v2 API) + dsc: + name: default-dsc + # Set to true to enable, false to disable + dashboard: true + workbenches: true + aipipelines: true #
Renamed from datasciencepipelines + kserve: true + modelregistry: true + ray: true + kueue: true + trainingoperator: true + trustyai: true + # Removed in v2: codeflare, modelmeshserving + + # Dashboard configuration (v2 API) + # Note: OdhDashboardConfig CRD removed in RHOAI 3.0 + # Dashboard settings managed via Dashboard component + dashboard: + # These settings may be used for future dashboard configuration + disableTracking: false + disableModelRegistry: false + disableModelCatalog: false + disableKServeMetrics: false + genAiStudio: true + modelAsService: true + disableLMEval: false + notebookControllerEnabled: true + notebookNamespace: rhods-notebooks + pvcSize: 20Gi +``` diff --git a/policies/rhoai/templates/policy-rhoai-config.yaml b/policies/rhoai/templates/policy-rhoai-config.yaml new file mode 100644 index 00000000..e688d908 --- /dev/null +++ b/policies/rhoai/templates/policy-rhoai-config.yaml @@ -0,0 +1,250 @@ +{{- $policyName := "policy-rhoai-config" }} +{{- $placementName := "placement-policy-rhoai-config" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + # Wait for RHOAI operator to be installed first + dependencies: + - name: policy-rhoai-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + # DSCInitialization - must be created before DataScienceCluster + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: rhoai-dsci + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + 
objectDefinition: + apiVersion: dscinitialization.opendatahub.io/v1 + kind: DSCInitialization + metadata: + name: {{ .Values.rhoai.dsci.name }} + spec: + applicationsNamespace: {{ .Values.rhoai.dsci.applicationsNamespace }} + monitoring: + managementState: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-monitoring" | default "{{ .Values.rhoai.dsci.monitoringState }}" {{ "hub}}" }}' + namespace: {{ .Values.rhoai.dsci.monitoringNamespace }} + serviceMesh: + controlPlane: + metricsCollection: Istio + name: {{ .Values.rhoai.dsci.serviceMeshControlPlaneName }} + namespace: {{ .Values.rhoai.dsci.serviceMeshNamespace }} + managementState: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-servicemesh" | default "{{ .Values.rhoai.dsci.serviceMeshState }}" {{ "hub}}" }}' + trustedCABundle: + managementState: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-trustedca" | default "{{ .Values.rhoai.dsci.trustedCABundleState }}" {{ "hub}}" }}' + # DataScienceCluster - configure component states + # RHOAI 3.0 uses v2 API with explicit managementState: Managed to enable components + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: rhoai-dsc + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + recreateOption: IfRequired + objectDefinition: + apiVersion: datasciencecluster.opendatahub.io/v2 + kind: DataScienceCluster + metadata: + name: {{ .Values.rhoai.dsc.name }} + spec: + components: + # RHOAI 3.0 v2 API - most components require explicit managementState: Managed + # kueue is an exception - it only supports Unmanaged or Removed (presence enables it) + # Hub template checks cluster labels, falls back to values.yaml defaults + {{- if or (eq (.Values.rhoai.dsc.aipipelines | toString) "true") (eq (.Values.rhoai.dsc.aipipelines | toString) "Managed") }} + aipipelines: + managementState: Managed + {{- end }} + {{- if or (eq 
(.Values.rhoai.dsc.dashboard | toString) "true") (eq (.Values.rhoai.dsc.dashboard | toString) "Managed") }} + dashboard: + managementState: Managed + {{- end }} + {{- if or (eq (.Values.rhoai.dsc.kserve | toString) "true") (eq (.Values.rhoai.dsc.kserve | toString) "Managed") }} + kserve: + managementState: Managed + {{- end }} + {{- if or (eq (.Values.rhoai.dsc.kueue | toString) "true") (eq (.Values.rhoai.dsc.kueue | toString) "Managed") }} + # kueue only supports Unmanaged/Removed - presence with config enables it + kueue: + defaultClusterQueueName: default + defaultLocalQueueName: default + {{- end }} + {{- if or (eq (.Values.rhoai.dsc.modelregistry | toString) "true") (eq (.Values.rhoai.dsc.modelregistry | toString) "Managed") }} + modelregistry: + managementState: Managed + {{- end }} + {{- if or (eq (.Values.rhoai.dsc.ray | toString) "true") (eq (.Values.rhoai.dsc.ray | toString) "Managed") }} + ray: + managementState: Managed + {{- end }} + {{- if or (eq (.Values.rhoai.dsc.trainingoperator | toString) "true") (eq (.Values.rhoai.dsc.trainingoperator | toString) "Managed") }} + trainingoperator: + managementState: Managed + {{- end }} + {{- if or (eq (.Values.rhoai.dsc.trustyai | toString) "true") (eq (.Values.rhoai.dsc.trustyai | toString) "Managed") }} + trustyai: + managementState: Managed + {{- end }} + {{- if or (eq (.Values.rhoai.dsc.workbenches | toString) "true") (eq (.Values.rhoai.dsc.workbenches | toString) "Managed") }} + workbenches: + managementState: Managed + {{- end }} + # OdhDashboardConfig - REMOVED in RHOAI 3.0 + # Dashboard configuration is now managed through the Dashboard component in DataScienceCluster + # Additional dashboard settings can be configured via OdhDashboardConfig if the CRD becomes available + # KnativeServing - required for KServe model serving + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: rhoai-knative-serving + spec: + remediationAction: enforce + 
severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: knative-serving + - complianceType: musthave + objectDefinition: + apiVersion: operator.knative.dev/v1beta1 + kind: KnativeServing + metadata: + name: knative-serving + namespace: knative-serving + spec: + config: + network: + ingress-class: "kourier.ingress.networking.knative.dev" + ingress: + kourier: + enabled: true + # Dashboard Route - expose the RHOAI dashboard + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: rhoai-dashboard-route + spec: + remediationAction: enforce + severity: medium + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + name: rhods-dashboard + namespace: {{ .Values.rhoai.dsci.applicationsNamespace }} + spec: + port: + targetPort: 8443 + tls: + insecureEdgeTerminationPolicy: Redirect + termination: reencrypt + to: + kind: Service + name: rhods-dashboard + weight: 100 + # Status check - verify DSCInitialization is ready + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: rhoai-dsci-status + spec: + remediationAction: inform + severity: medium + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: dscinitialization.opendatahub.io/v1 + kind: DSCInitialization + metadata: + name: {{ .Values.rhoai.dsci.name }} + status: + phase: Ready + # Status check - verify DataScienceCluster is ready + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: rhoai-dsc-status + spec: + remediationAction: inform + severity: medium + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: datasciencecluster.opendatahub.io/v1 + kind: DataScienceCluster + metadata: + name: {{ .Values.rhoai.dsc.name 
}} + status: + phase: Ready +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/rhoai' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy + diff --git a/policies/rhoai/templates/policy-rhoai-operator-install.yaml b/policies/rhoai/templates/policy-rhoai-operator-install.yaml new file mode 100644 index 00000000..7f4bd42a --- /dev/null +++ b/policies/rhoai/templates/policy-rhoai-operator-install.yaml @@ -0,0 +1,103 @@ +{{- $policyName := "policy-rhoai-operator-install" }} +{{- $placementName := "placement-policy-rhoai-operator-install" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: 
ConfigurationPolicy + metadata: + name: rhoai-operator-ns + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: {{ .Values.rhoai.namespace }} + labels: + openshift.io/cluster-monitoring: "true" + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1beta1 + kind: OperatorPolicy + metadata: + name: install-rhoai + spec: + remediationAction: enforce + severity: high + complianceType: musthave + operatorGroup: + name: {{ .Values.rhoai.operatorGroupName }} + namespace: {{ .Values.rhoai.namespace }} + {{- if .Values.rhoai.targetNamespaces }} + targetNamespaces: + {{- range .Values.rhoai.targetNamespaces }} + - {{ . }} + {{- end }} + {{- end }} + subscription: + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-version" | default "" {{ "hub}}" }}' + namespace: {{ .Values.rhoai.namespace }} + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-channel" | default "{{ .Values.rhoai.channel }}" {{ "hub}}" }}' + name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-subscription-name" | default "{{ .Values.rhoai.name }}" {{ "hub}}" }}' + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-source" | default "{{ .Values.rhoai.source }}" {{ "hub}}" }}' + sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-source-namespace" | default "{{ .Values.rhoai.sourceNamespace }}" {{ "hub}}" }}' + # Upgrade approval for subscription will be set based on if version is set not this flag + # If version is set install plan is set to manual and only the version specified will be installed + upgradeApproval: Automatic + versions: + - '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/rhoai-version" | default "" {{ "hub}}" }}' +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + 
namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/rhoai' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy \ No newline at end of file diff --git a/policies/rhoai/values.yaml b/policies/rhoai/values.yaml new file mode 100644 index 00000000..012e6cb0 --- /dev/null +++ b/policies/rhoai/values.yaml @@ -0,0 +1,107 @@ +# Default values for rhoai +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+policy_namespace: open-cluster-policies + +rhoai: + # Operator settings + name: rhods-operator + namespace: redhat-ods-operator + channel: fast-3.x + source: redhat-operators + sourceNamespace: openshift-marketplace + operatorGroupName: rhoai-operator-group + # targetNamespaces: # Optional: specify target namespaces for namespace-scoped operators + # - redhat-ods-operator + + # DSCInitialization settings + dsci: + name: default-dsci + applicationsNamespace: redhat-ods-applications + monitoringNamespace: redhat-ods-monitoring + monitoringState: Managed + serviceMeshState: Managed + serviceMeshNamespace: istio-system + serviceMeshControlPlaneName: data-science-smcp + trustedCABundleState: Managed + + # DataScienceCluster settings (RHOAI 3.0 v2 API) + dsc: + name: default-dsc + # RHOAI 3.0 uses v2 API - components set to true are rendered with managementState: Managed (kueue: default queue names) + # Note: codeflare, modelmeshserving, datasciencepipelines removed in v2 + # New component names: + # - datasciencepipelines → aipipelines + # Components enabled by default (set to 'true' to enable): + dashboard: true + workbenches: true + aipipelines: true + kserve: true + modelregistry: true + ray: true + kueue: true + trainingoperator: true + trustyai: true + + # OdhDashboardConfig settings + dashboard: + # Tracking and analytics + disableTracking: false + # Model features + disableModelRegistry: false + disableModelCatalog: false + disableKServeMetrics: false + # GenAI features + genAiStudio: true + modelAsService: true + disableLMEval: false + # Notebook controller + notebookControllerEnabled: true + notebookNamespace: rhods-notebooks + pvcSize: 20Gi + +# hubClusterSets: +# hub: +# labels: +# rhoai: 'true' +# managedClusterSets: +# managed: +# labels: +# rhoai: 'true' + +### AutoShift Labels Documentation +# The following labels can be set at the cluster or clusterset level to configure this policy: +# +# Enable/Disable: +# rhoai: 'true' or 'false' - Controls whether RHOAI is managed +# +# Operator Configuration: +#
rhoai-subscription-name: Operator subscription name (default: 'rhods-operator') +# rhoai-channel: Operator channel (default: 'fast-3.x') +# rhoai-version: Specific operator version (CSV) to install (optional) +# rhoai-source: Operator catalog source (default: 'redhat-operators') +# rhoai-source-namespace: Catalog namespace (default: 'openshift-marketplace') +# +# DataScienceCluster Component Configuration (RHOAI 3.0 v2 API): +# Note: the DataScienceCluster template reads the component toggles from rhoai.dsc.* in this file at Helm render time +# Enabled components are rendered with managementState: Managed (kueue is rendered with default queue names instead) +# +# Available components (currently read from the rhoai.dsc.* values, not from these labels): +# rhoai-dashboard: Dashboard UI (default: enabled) +# rhoai-workbenches: Jupyter workbenches (default: enabled) +# rhoai-aipipelines: AI Pipelines (replaces datasciencepipelines, default: enabled) +# rhoai-kserve: KServe model serving (default: enabled) +# rhoai-modelregistry: Model Registry (default: enabled) +# rhoai-ray: Ray distributed computing (default: enabled) +# rhoai-kueue: Kueue job queuing (default: enabled) +# rhoai-training: Training Operator (default: enabled) +# rhoai-trustyai: TrustyAI explainability (default: enabled) +# +# Removed in RHOAI 3.0: +# - codeflare (functionality merged into other components) +# - modelmeshserving (use KServe instead) +# - datasciencepipelines (renamed to aipipelines) +# +# Examples: +# autoshift.io/rhoai: 'true' +# autoshift.io/rhoai-channel: 'fast-3.x' diff --git a/policies/serverless/Chart.yaml b/policies/serverless/Chart.yaml new file mode 100644 index 00000000..9fdaf1cd --- /dev/null +++ b/policies/serverless/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: serverless +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer.
They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" \ No newline at end of file diff --git a/policies/serverless/README.md b/policies/serverless/README.md new file mode 100644 index 00000000..1cfaf9ae --- /dev/null +++ b/policies/serverless/README.md @@ -0,0 +1,274 @@ +# serverless AutoShift Policy + +## Overview +This policy installs the serverless-operator operator using AutoShift patterns. + +## Status +✅ **Operator Installation**: Ready to deploy +🔧 **Configuration**: Requires operator-specific setup (see below) + +## Quick Deploy + +### Test Locally +```bash +# Validate policy renders correctly +helm template policies/serverless/ +``` + +### Enable on Clusters +Edit AutoShift values files to add the operator labels: + +```yaml +# In autoshift/values.hub.yaml (or values.sbx.yaml, etc.) 
+hubClusterSets: + hub: + labels: + serverless: 'true' + serverless-subscription-name: 'serverless-operator' + serverless-channel: 'stable' + serverless-source: 'redhat-operators' + serverless-source-namespace: 'openshift-marketplace' + # serverless-version: 'serverless-operator.v1.x.x' # Optional: pin to specific CSV version + +managedClusterSets: + managed: + labels: + serverless: 'true' + serverless-subscription-name: 'serverless-operator' + serverless-channel: 'stable' + serverless-source: 'redhat-operators' + serverless-source-namespace: 'openshift-marketplace' + # serverless-version: 'serverless-operator.v1.x.x' # Optional: pin to specific CSV version + +# For specific clusters (optional override) +clusters: + my-cluster: + labels: + serverless: 'true' + serverless-channel: 'fast' # Override channel for this cluster +``` + +Labels are automatically propagated to clusters via the cluster-labels policy. + +### Add to AutoShift ApplicationSet +Edit `autoshift/templates/applicationset.yaml` and add: +```yaml +- name: serverless + path: policies/serverless + helm: + valueFiles: + - values.yaml +``` + +## Configuration + +### Namespace Scope +This operator is configured as: +- **Cluster-scoped**: Manages resources across all namespaces (default) +- **Namespace-scoped**: Limited to specific target namespaces (if `targetNamespaces` enabled in values.yaml) + +To change scope, edit `values.yaml` and uncomment/configure the `targetNamespaces` field. 
+ +### Version Control +This policy supports AutoShift's operator version control system: + +- **Automatic Upgrades**: By default, the operator follows automatic upgrade paths within its channel +- **Version Pinning**: Add `serverless-version` label to pin to a specific CSV version +- **Manual Control**: Pinned versions require manual updates to upgrade + +To pin to a specific version, add the version label to your cluster or clusterset: +```yaml +serverless-version: 'serverless-operator.v1.x.x' +``` + +Find available CSV versions: +```bash +# List available versions for this operator +oc get packagemanifests serverless-operator -o jsonpath='{.status.channels[*].currentCSV}' +``` + +## Next Steps: Configuration + +### 1. Explore Installed CRDs +After operator installation, check what Custom Resources are available: +```bash +# Wait for operator to install +oc get pods -n openshift-serverless + +# Check available CRDs +oc get crds | grep serverless + +# Explore CRD specifications (pass the resource to inspect, e.g. KnativeServing) +oc explain knativeserving.spec +``` + +### 2. Create Configuration Policies +Add operator-specific configuration policies to `templates/` directory.
+ +#### Common Patterns: +- `policy-serverless-config.yaml` - Main configuration +- `policy-serverless-.yaml` - Feature-specific configs + +#### Template Structure: +```yaml +{{- $policyName := "policy-serverless-config" }} +{{- $placementName := "placement-policy-serverless-config" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + dependencies: + - name: policy-serverless-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-config + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: # Your operator's API version + kind: # Your operator's Custom Resource + metadata: + name: serverless-config + namespace: {{ .Values.serverless.namespace }} + spec: + # Your operator-specific configuration + # Use dynamic labels when needed: + # setting: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-setting" | default "default-value" {{ "hub}}" }}' + pruneObjectBehavior: None +--- +# Use same placement as operator install or create specific targeting +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + 
- {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/serverless' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy +``` + +### 3. Reference Examples +**Study similar complexity policies:** +- **Simple**: `policies/openshift-gitops/` - Basic operator + ArgoCD config +- **Medium**: `policies/advanced-cluster-security/` - Multiple related policies +- **Complex**: `policies/metallb/` - Multiple configuration types (L2, BGP, etc.) +- **Advanced**: `policies/openshift-data-foundation/` - Storage cluster configuration + +### 4. 
AutoShift Labels +Add configuration labels to `values.yaml` and use in templates: + +```yaml +# Add to values.yaml AutoShift Labels Documentation: +# serverless-setting: Configuration option (default: 'value') +# serverless-feature-enabled: Enable optional feature (default: 'false') +# serverless-provider: Provider-specific config (default: 'generic') + +# Use in templates: +setting: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-setting" | default "default-value" {{ "hub}}" }}' +``` + +## Common Patterns + +### CSV Status Checking (Optional) +For operators that need installation verification: +```yaml +- objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-csv-status + spec: + remediationAction: inform + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: operators.coreos.com/v1alpha1 + kind: ClusterServiceVersion + metadata: + namespace: {{ .Values.serverless.namespace }} + status: + phase: Succeeded +``` + +### ArgoCD Sync Annotations (If Needed) +For policies requiring special sync behavior: +```yaml +annotations: + argocd.argoproj.io/sync-options: Prune=false,SkipDryRunOnMissingResource=true + argocd.argoproj.io/compare-options: IgnoreExtraneous + argocd.argoproj.io/sync-wave: "1" +``` + +## Troubleshooting + +### Policy Not Applied +1. Check cluster labels: `oc get managedcluster --show-labels` +2. Verify placement: `oc get placement -n open-cluster-policies` +3. Check policy status: `oc describe policy policy-serverless-operator-install` + +### Operator Installation Issues +1. Check subscription: `oc get subscription -n openshift-serverless` +2. Check install plan: `oc get installplan -n openshift-serverless` +3. Verify operator source exists: `oc get catalogsource -n openshift-marketplace` + +### Template Rendering Issues +1. Test locally: `helm template policies/serverless/` +2. 
Check hub escaping: Look for `{{ "{{hub" }} ... {{ "hub}}" }}` patterns +3. Validate YAML: `helm lint policies/serverless/` + +## Resources +- [Operator Documentation](https://operatorhub.io/operator/serverless-operator) - Find your operator details +- [AutoShift Policy Patterns](../../README-DEVELOPER.md) - Comprehensive policy development guide +- [ACM Policy Documentation](https://access.redhat.com/documentation/en-us/red_hat_advanced_cluster_management_for_kubernetes) - Policy syntax reference in the Governance section +- [Similar Policies](../) - Browse other policies for patterns and examples \ No newline at end of file diff --git a/policies/serverless/templates/policy-serverless-config.yaml b/policies/serverless/templates/policy-serverless-config.yaml new file mode 100644 index 00000000..8ef685f2 --- /dev/null +++ b/policies/serverless/templates/policy-serverless-config.yaml @@ -0,0 +1,165 @@ +{{- $policyName := "policy-serverless-config" }} +{{- $placementName := "placement-policy-serverless-config" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + # Wait for Serverless operator to be installed first + dependencies: + - name: policy-serverless-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + # Create knative-serving namespace + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-knative-serving-ns + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave +
objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: knative-serving + # Create KnativeServing instance + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-knative-serving + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: operator.knative.dev/v1beta1 + kind: KnativeServing + metadata: + name: knative-serving + namespace: knative-serving + spec: + config: + network: + ingress-class: "kourier.ingress.networking.knative.dev" + ingress: + kourier: + enabled: true + high-availability: + replicas: {{ .Values.serverless.serving.replicas | default 2 }} + # Create knative-eventing namespace (if enabled) + {{- if .Values.serverless.eventing.enabled }} + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-knative-eventing-ns + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: knative-eventing + # Create KnativeEventing instance (if enabled) + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-knative-eventing + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: operator.knative.dev/v1beta1 + kind: KnativeEventing + metadata: + name: knative-eventing + namespace: knative-eventing + spec: + high-availability: + replicas: {{ .Values.serverless.eventing.replicas | default 2 }} + {{- end }} + # Status check - verify KnativeServing is ready + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-knative-serving-status + spec: + remediationAction: inform + 
severity: medium + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: operator.knative.dev/v1beta1 + kind: KnativeServing + metadata: + name: knative-serving + namespace: knative-serving + status: + conditions: + - status: "True" + type: Ready +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/serverless' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy + diff --git a/policies/serverless/templates/policy-serverless-operator-install.yaml b/policies/serverless/templates/policy-serverless-operator-install.yaml new file mode 100644 index 00000000..fa4749d7 --- /dev/null +++ b/policies/serverless/templates/policy-serverless-operator-install.yaml @@ -0,0 +1,103 @@ +{{- $policyName := "policy-serverless-operator-install" }} +{{- $placementName := "placement-policy-serverless-operator-install" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: 
NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + policy-templates: + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: serverless-operator-ns + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: {{ .Values.serverless.namespace }} + labels: + openshift.io/cluster-monitoring: "true" + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1beta1 + kind: OperatorPolicy + metadata: + name: install-serverless + spec: + remediationAction: enforce + severity: high + complianceType: musthave + operatorGroup: + name: {{ .Values.serverless.operatorGroupName }} + namespace: {{ .Values.serverless.namespace }} + {{- if .Values.serverless.targetNamespaces }} + targetNamespaces: + {{- range .Values.serverless.targetNamespaces }} + - {{ . 
}} + {{- end }} + {{- end }} + subscription: + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-version" | default "" {{ "hub}}" }}' + namespace: {{ .Values.serverless.namespace }} + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-channel" | default "{{ .Values.serverless.channel }}" {{ "hub}}" }}' + name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-subscription-name" | default "{{ .Values.serverless.name }}" {{ "hub}}" }}' + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-source" | default "{{ .Values.serverless.source }}" {{ "hub}}" }}' + sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-source-namespace" | default "{{ .Values.serverless.sourceNamespace }}" {{ "hub}}" }}' + # Upgrade approval for subscription will be set based on if version is set not this flag + # If version is set install plan is set to manual and only the version specified will be installed + upgradeApproval: Automatic + versions: + - '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/serverless-version" | default "" {{ "hub}}" }}' +--- +apiVersion: cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/serverless' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + 
namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy \ No newline at end of file diff --git a/policies/serverless/values.yaml b/policies/serverless/values.yaml new file mode 100644 index 00000000..655a8cb0 --- /dev/null +++ b/policies/serverless/values.yaml @@ -0,0 +1,52 @@ +# Default values for serverless +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +policy_namespace: open-cluster-policies +serverless: + name: serverless-operator + namespace: openshift-serverless + channel: stable + source: redhat-operators + sourceNamespace: openshift-marketplace + operatorGroupName: serverless-operator-group + # targetNamespaces: # Optional: specify target namespaces for namespace-scoped operators + # - openshift-serverless + + # KnativeServing configuration + serving: + replicas: 2 # HA replicas for serving components + + # KnativeEventing configuration + eventing: + enabled: false # Set to true to deploy KnativeEventing + replicas: 2 # HA replicas for eventing components + +# hubClusterSets: +# hub: +# labels: +# test1: 'test' +# managedClusterSets: +# managed: +# labels: +# test4: 'test' + +### AutoShift Labels Documentation +# The following labels can be set at the cluster or clusterset level to configure this policy: +# +# Enable/Disable: +# serverless: 'true' or 'false' - Controls whether serverless is managed +# +# Configuration: +# serverless-subscription-name: Operator subscription name (default: 'serverless-operator') +# serverless-channel: Operator channel (default: 'stable') +# serverless-version: Specific operator version (CSV) to install (optional) +# serverless-source: Operator catalog source (default: 'redhat-operators') +# serverless-source-namespace: Catalog namespace (default: 'openshift-marketplace') +# +# Examples: +# 
autoshift.io/serverless: 'true' +# autoshift.io/serverless-subscription-name: 'serverless-operator' +# autoshift.io/serverless-channel: 'stable' +# autoshift.io/serverless-version: 'operator-name.v1.x.x' +# autoshift.io/serverless-source: 'redhat-operators' +# autoshift.io/serverless-source-namespace: 'openshift-marketplace' \ No newline at end of file diff --git a/policies/servicemesh3/templates/policy-servicemesh3-config.yaml b/policies/servicemesh3/templates/policy-servicemesh3-config.yaml new file mode 100644 index 00000000..009e907f --- /dev/null +++ b/policies/servicemesh3/templates/policy-servicemesh3-config.yaml @@ -0,0 +1,149 @@ +{{- $policyName := "policy-servicemesh3-config" }} +{{- $placementName := "placement-policy-servicemesh3-config" }} + +apiVersion: policy.open-cluster-management.io/v1 +kind: Policy +metadata: + name: {{ $policyName }} + namespace: {{ .Values.policy_namespace }} + annotations: + policy.open-cluster-management.io/standards: NIST SP 800-53 + policy.open-cluster-management.io/categories: CM Configuration Management + policy.open-cluster-management.io/controls: CM-2 Baseline Configuration +spec: + disabled: false + # Wait for Service Mesh 3 operator to be installed first + dependencies: + - name: policy-servicemesh3-operator-install + namespace: {{ .Values.policy_namespace }} + apiVersion: policy.open-cluster-management.io/v1 + compliance: Compliant + kind: Policy + policy-templates: + # Create istio-system namespace + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: servicemesh3-istio-ns + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: Namespace + metadata: + name: {{ .Values.servicemesh3.istio.namespace | default "istio-system" }} + # Create IstioCNI instance (recommended for OpenShift) unless istioCni.enabled is explicitly false; `default` cannot be used here because it treats false as unset + {{- if ne (toString .Values.servicemesh3.istioCni.enabled) "false" }} + -
objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: servicemesh3-istio-cni + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: sailoperator.io/v1 + kind: IstioCNI + metadata: + name: {{ .Values.servicemesh3.istioCni.name | default "default" }} + spec: + version: {{ .Values.servicemesh3.istioCni.version | default "v1.26.2" | quote }} + namespace: {{ .Values.servicemesh3.istio.namespace | default "istio-system" }} + {{- end }} + # Create Istio control plane (version fallbacks in this template match the chart's values.yaml default) + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: servicemesh3-istio + spec: + remediationAction: enforce + severity: high + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: sailoperator.io/v1 + kind: Istio + metadata: + name: {{ .Values.servicemesh3.istio.name | default "default" }} + spec: + version: {{ .Values.servicemesh3.istio.version | default "v1.26.2" | quote }} + namespace: {{ .Values.servicemesh3.istio.namespace | default "istio-system" }} + updateStrategy: + type: {{ .Values.servicemesh3.istio.updateStrategy | default "InPlace" }} + values: + global: + # Use OpenShift CNI + platform: openshift + pilot: + resources: + requests: + cpu: {{ .Values.servicemesh3.istio.pilot.cpuRequest | default "100m" }} + memory: {{ .Values.servicemesh3.istio.pilot.memoryRequest | default "256Mi" }} + # Status check - verify Istio is ready + - objectDefinition: + apiVersion: policy.open-cluster-management.io/v1 + kind: ConfigurationPolicy + metadata: + name: servicemesh3-istio-status + spec: + remediationAction: inform + severity: medium + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: sailoperator.io/v1 + kind: Istio + metadata: + name: {{ .Values.servicemesh3.istio.name | default "default" }} + status: + state: Healthy +--- +apiVersion:
cluster.open-cluster-management.io/v1beta1 +kind: Placement +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +spec: + clusterSets: + {{- range $clusterSet, $value := $.Values.hubClusterSets }} + - {{ $clusterSet }} + {{- end }} + {{- range $clusterSet, $value := $.Values.managedClusterSets }} + - {{ $clusterSet }} + {{- end }} + predicates: + - requiredClusterSelector: + labelSelector: + matchExpressions: + - key: 'autoshift.io/servicemesh3' + operator: In + values: + - 'true' + tolerations: + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: PlacementBinding +metadata: + name: {{ $placementName }} + namespace: {{ .Values.policy_namespace }} +placementRef: + name: {{ $placementName }} + apiGroup: cluster.open-cluster-management.io + kind: Placement +subjects: + - name: {{ $policyName }} + apiGroup: policy.open-cluster-management.io + kind: Policy + diff --git a/policies/servicemesh3/templates/policy-servicemesh3-operator-install.yaml b/policies/servicemesh3/templates/policy-servicemesh3-operator-install.yaml index a83cc907..84475213 100644 --- a/policies/servicemesh3/templates/policy-servicemesh3-operator-install.yaml +++ b/policies/servicemesh3/templates/policy-servicemesh3-operator-install.yaml @@ -38,7 +38,8 @@ spec: severity: high complianceType: musthave operatorGroup: - name: {{ .Values.servicemesh3.operatorGroupName }} + # if operator group exists in namespace use that + name: '{{ "{{" }} with (lookup "operators.coreos.com/v1" "OperatorGroup" "{{ .Values.servicemesh3.namespace }}" "").items {{ "}}" }}{{ "{{" }} (index . 
0).metadata.name {{ "}}" }}{{ "{{" }} else {{ "}}" }}{{ .Values.servicemesh3.operatorGroupName }}{{ "{{" }} end {{ "}}" }}' namespace: {{ .Values.servicemesh3.namespace }} {{- if .Values.servicemesh3.targetNamespaces }} targetNamespaces: @@ -47,12 +48,18 @@ spec: {{- end }} {{- end }} subscription: - startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-version" | default "" {{ "hub}}" }}' - namespace: {{ .Values.servicemesh3.namespace }} - channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-channel" | default "{{ .Values.servicemesh3.channel }}" {{ "hub}}" }}' name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-subscription-name" | default "{{ .Values.servicemesh3.name }}" {{ "hub}}" }}' - source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-source" | default "{{ .Values.servicemesh3.source }}" {{ "hub}}" }}' + namespace: {{ .Values.servicemesh3.namespace }} + # Source Selection Logic (evaluated on hub for each managed cluster): + # 1. If cluster has explicit 'autoshift.io/servicemesh3-source' label → use that value + # 2. Else if cluster has 'autoshift.io/disconnected-mirror: true': + # → use {source}-{suffix} (e.g., redhat-operators-mirror) + # → suffix from 'autoshift.io/mirror-catalog-suffix' label, or default 'mirror' (the hyphen is added by the template) + # 3.
Else → use standard source (e.g., redhat-operators) + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-source" | default (ternary (printf "%s-%s" "{{ .Values.servicemesh3.source }}" (index .ManagedClusterLabels "autoshift.io/mirror-catalog-suffix" | default "mirror")) "{{ .Values.servicemesh3.source }}" (eq (index .ManagedClusterLabels "autoshift.io/disconnected-mirror") "true")) {{ "hub}}" }}' sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-source-namespace" | default "{{ .Values.servicemesh3.sourceNamespace }}" {{ "hub}}" }}' + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-channel" | default "{{ .Values.servicemesh3.channel }}" {{ "hub}}" }}' + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/servicemesh3-version" | default "" {{ "hub}}" }}' # Upgrade approval for subscription will be set based on if version is set not this flag # If version is set install plan is set to manual and only the version specified will be installed upgradeApproval: Automatic diff --git a/policies/servicemesh3/values.yaml b/policies/servicemesh3/values.yaml index 74ae2956..322ee23c 100644 --- a/policies/servicemesh3/values.yaml +++ b/policies/servicemesh3/values.yaml @@ -5,12 +5,28 @@ policy_namespace: open-cluster-policies servicemesh3: name: servicemeshoperator3 namespace: openshift-operators - channel: stable + channel: stable-3.2 source: redhat-operators sourceNamespace: openshift-marketplace operatorGroupName: global-operators # targetNamespaces: # Optional: specify target namespaces for namespace-scoped operators # - openshift-operators + + # Istio control plane configuration + istio: + name: default + namespace: istio-system + version: "v1.26.2" # Must match servicemesh3 operator version + updateStrategy: InPlace + pilot: + cpuRequest: "100m" + memoryRequest: "256Mi" + + # IstioCNI configuration (recommended for OpenShift) + istioCni: + enabled: true + name: 
default + version: "v1.26.2" # Must match servicemesh3 operator version # hubClusterSets: # hub: diff --git a/policies/tempo/templates/policy-tempo-operator-install.yaml b/policies/tempo/templates/policy-tempo-operator-install.yaml index d92589c5..0d6eb9a1 100644 --- a/policies/tempo/templates/policy-tempo-operator-install.yaml +++ b/policies/tempo/templates/policy-tempo-operator-install.yaml @@ -38,7 +38,8 @@ spec: severity: high complianceType: musthave operatorGroup: - name: {{ .Values.tempo.operatorGroupName }} + # if operator group exists in namespace use that + name: '{{ "{{" }} with (lookup "operators.coreos.com/v1" "OperatorGroup" "{{ .Values.tempo.namespace }}" "").items {{ "}}" }}{{ "{{" }} (index . 0).metadata.name {{ "}}" }}{{ "{{" }} else {{ "}}" }}{{ .Values.tempo.operatorGroupName }}{{ "{{" }} end {{ "}}" }}' namespace: {{ .Values.tempo.namespace }} {{- if .Values.tempo.targetNamespaces }} targetNamespaces: @@ -47,12 +48,18 @@ spec: {{- end }} {{- end }} subscription: - startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-version" | default "" {{ "hub}}" }}' - namespace: {{ .Values.tempo.namespace }} - channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-channel" | default "{{ .Values.tempo.channel }}" {{ "hub}}" }}' name: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-subscription-name" | default "{{ .Values.tempo.name }}" {{ "hub}}" }}' - source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-source" | default "{{ .Values.tempo.source }}" {{ "hub}}" }}' + namespace: {{ .Values.tempo.namespace }} + # Source Selection Logic (evaluated on hub for each managed cluster): + # 1. If cluster has explicit 'autoshift.io/tempo-source' label → use that value + # 2. Else if cluster has 'autoshift.io/disconnected-mirror: true': + # → use {source}-{suffix} (e.g., redhat-operators-mirror) + # → suffix from 'autoshift.io/mirror-catalog-suffix' label, or default 'mirror' (the hyphen is added by the template) + # 3.
Else → use standard source (e.g., redhat-operators) + source: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-source" | default (ternary (printf "%s-%s" "{{ .Values.tempo.source }}" (index .ManagedClusterLabels "autoshift.io/mirror-catalog-suffix" | default "mirror")) "{{ .Values.tempo.source }}" (eq (index .ManagedClusterLabels "autoshift.io/disconnected-mirror") "true")) {{ "hub}}" }}' sourceNamespace: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-source-namespace" | default "{{ .Values.tempo.sourceNamespace }}" {{ "hub}}" }}' + channel: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-channel" | default "{{ .Values.tempo.channel }}" {{ "hub}}" }}' + startingCSV: '{{ "{{hub" }} index .ManagedClusterLabels "autoshift.io/tempo-version" | default "" {{ "hub}}" }}' # Upgrade approval for subscription will be set based on if version is set not this flag # If version is set install plan is set to manual and only the version specified will be installed upgradeApproval: Automatic