From 6d7a60bc7d7eb60bb830e1080b8f184f000e516f Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Mon, 29 Jan 2024 17:36:17 +0100 Subject: [PATCH 1/5] cvo: read enabled feature flags from cluster Read CVO-related Feature Flags from the cluster resource and propagate them to CVO controllers. Multiplex the coarse cluster feature flag into smaller, CVO-specific flags for easier maintenance in the future. --- ...luster-version-operator_03_deployment.yaml | 2 + pkg/cvo/cvo.go | 63 +++++++++++++++++++ .../featurechangestopper.go | 21 ++++++- .../featurechangestopper_test.go | 62 +++++++++++++----- pkg/start/start.go | 20 ++++-- pkg/start/start_integration_test.go | 9 ++- 6 files changed, 151 insertions(+), 26 deletions(-) diff --git a/install/0000_00_cluster-version-operator_03_deployment.yaml b/install/0000_00_cluster-version-operator_03_deployment.yaml index d8a32bd694..8950390a26 100644 --- a/install/0000_00_cluster-version-operator_03_deployment.yaml +++ b/install/0000_00_cluster-version-operator_03_deployment.yaml @@ -56,6 +56,8 @@ spec: name: kube-api-access readOnly: true env: + - name: OPERATOR_IMAGE_VERSION + value: "0.0.1-snapshot" - name: KUBERNETES_SERVICE_PORT # allows CVO to communicate with apiserver directly on same host. Is substituted with port from infrastructures.status.apiServerInternalURL if available. value: "6443" - name: KUBERNETES_SERVICE_HOST # allows CVO to communicate with apiserver directly on same host. Is substituted with hostname from infrastructures.status.apiServerInternalURL if available. diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 134f2cafd5..51f13f2f70 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "net/http" + "os" "strconv" "sync" "time" @@ -56,6 +57,61 @@ const ( maxRetries = 15 ) +// FeatureGates contains flags that control CVO functionality gated by product feature gates. The +// names do not correspond to product feature gates, the booleans here are "smaller" (product-level +// gate will enable multiple CVO behaviors). +type FeatureGates struct { + // UnknownVersion flag is set to true if CVO did not find a matching version in the FeatureGate + // status resource, meaning the current set of enabled and disabled feature gates is unknown for + // this version. This should be a temporary state (config-operator should eventually add the + // enabled/disabled flags for this version), so CVO should try to behave in a way that reflects + // a "good default": default-on flags are enabled, default-off flags are disabled. Where reasonable, + // It can also attempt to tolerate the existing state: if it finds evidence that a feature was + // enabled, it can continue to behave as if it was enabled and vice versa. This temporary state + // should be eventually resolved when the FeatureGate status resource is updated, which forces CVO + // to restart when the flags change. + UnknownVersion bool + + // ResourceReconciliationIssuesCondition controls whether CVO maintains a Condition with + // ResourceReconciliationIssues type, containing a JSON that describes all "issues" that prevented + // or delayed CVO from reconciling individual resources in the cluster. This is a pseudo-API + // that the experimental work for "oc adm upgrade status" uses to report upgrade status, and + // should never be relied upon by any production code. We may want to eventually turn this into + // some kind of "real" API. + ResourceReconciliationIssuesCondition bool +} + +func DefaultGatesWhenUnknown() FeatureGates { + return FeatureGates{ + UnknownVersion: true, + + ResourceReconciliationIssuesCondition: false, + } +} + +func GetCvoGatesFrom(gate *configv1.FeatureGate) FeatureGates { + enabledGates := DefaultGatesWhenUnknown() + // This is analogical to VersionForOperatorFromEnv() from o/library-go but the import is pretty + // heavy for a single, simple os.Getenv wrapper, so we just inline the logic here. + operatorVersion := os.Getenv("OPERATOR_IMAGE_VERSION") + klog.Infof("Looking up feature gates for version %s", operatorVersion) + for _, g := range gate.Status.FeatureGates { + + if g.Version != operatorVersion { + continue + } + // We found the matching version, so we do not need to run in the unknown version mode + enabledGates.UnknownVersion = false + for _, enabled := range g.Enabled { + if enabled.Name == configv1.FeatureGateUpgradeStatus { + enabledGates.ResourceReconciliationIssuesCondition = true + } + } + } + + return enabledGates +} + // Operator defines cluster version operator. The CVO attempts to reconcile the appropriate image // onto the cluster, writing status to the ClusterVersion object as it goes. A background loop // periodically checks for new updates from a server described by spec.upstream and spec.channel. @@ -168,6 +224,11 @@ type Operator struct { // moving resource, so it is not re-detected live. requiredFeatureSet string + // enabledFeatureGates contains flags that control CVO functionality gated by product feature gates. The + // names do not correspond to product feature gates, the booleans here are "smaller" (product-level + // gate will enable multiple CVO behaviors). + enabledFeatureGates FeatureGates + clusterProfile string uid types.UID } @@ -188,6 +249,7 @@ func New( kubeClient kubernetes.Interface, exclude string, requiredFeatureSet string, + enabledFeatureGates FeatureGates, clusterProfile string, promqlTarget clusterconditions.PromQLTarget, injectClusterIdIntoPromQL bool, @@ -220,6 +282,7 @@ func New( exclude: exclude, requiredFeatureSet: requiredFeatureSet, + enabledFeatureGates: enabledFeatureGates, clusterProfile: clusterProfile, conditionRegistry: standard.NewConditionRegistry(promqlTarget), injectClusterIdIntoPromQL: injectClusterIdIntoPromQL, diff --git a/pkg/featurechangestopper/featurechangestopper.go b/pkg/featurechangestopper/featurechangestopper.go index 93e0afed99..0466c24b45 100644 --- a/pkg/featurechangestopper/featurechangestopper.go +++ b/pkg/featurechangestopper/featurechangestopper.go @@ -15,11 +15,14 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" + + "github.com/openshift/cluster-version-operator/pkg/cvo" ) // FeatureChangeStopper calls stop when the value of the featureset changes type FeatureChangeStopper struct { startingRequiredFeatureSet string + startingCvoFeatureGates cvo.FeatureGates featureGateLister configlistersv1.FeatureGateLister cacheSynced []cache.InformerSynced @@ -31,10 +34,12 @@ type FeatureChangeStopper struct { // New returns a new FeatureChangeStopper. func New( startingRequiredFeatureSet string, + startingCvoGates cvo.FeatureGates, featureGateInformer configinformersv1.FeatureGateInformer, ) (*FeatureChangeStopper, error) { c := &FeatureChangeStopper{ startingRequiredFeatureSet: startingRequiredFeatureSet, + startingCvoFeatureGates: startingCvoGates, featureGateLister: featureGateInformer.Lister(), cacheSynced: []cache.InformerSynced{featureGateInformer.Informer().HasSynced}, queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "feature-gate-stopper"), @@ -61,22 +66,32 @@ func New( // syncHandler processes a single work entry, with the // processNextWorkItem caller handling the queue management. It returns // done when there will be no more work (because the feature gate changed). -func (c *FeatureChangeStopper) syncHandler(ctx context.Context) (done bool, err error) { +func (c *FeatureChangeStopper) syncHandler(_ context.Context) (done bool, err error) { var current configv1.FeatureSet + var currentCvoGates cvo.FeatureGates if featureGates, err := c.featureGateLister.Get("cluster"); err == nil { current = featureGates.Spec.FeatureSet + currentCvoGates = cvo.GetCvoGatesFrom(featureGates) } else if !apierrors.IsNotFound(err) { return false, err } - if string(current) != c.startingRequiredFeatureSet { + featureSetChanged := string(current) != c.startingRequiredFeatureSet + cvoFeaturesChanged := currentCvoGates != c.startingCvoFeatureGates + if featureSetChanged || cvoFeaturesChanged { var action string if c.shutdownFn == nil { action = "no shutdown function configured" } else { action = "requesting shutdown" } - klog.Infof("FeatureSet was %q, but the current feature set is %q; %s.", c.startingRequiredFeatureSet, current, action) + if featureSetChanged { + klog.Infof("FeatureSet was %q, but the current feature set is %q; %s.", c.startingRequiredFeatureSet, current, action) + } + if cvoFeaturesChanged { + klog.Infof("CVO feature flags were %v, but changed to %v; %s.", c.startingCvoFeatureGates, currentCvoGates, action) + } + if c.shutdownFn != nil { c.shutdownFn() } diff --git a/pkg/featurechangestopper/featurechangestopper_test.go b/pkg/featurechangestopper/featurechangestopper_test.go index 9b92cb8ff5..346547a64c 100644 --- a/pkg/featurechangestopper/featurechangestopper_test.go +++ b/pkg/featurechangestopper/featurechangestopper_test.go @@ -9,45 +9,69 @@ import ( fakeconfigv1client "github.com/openshift/client-go/config/clientset/versioned/fake" configv1informer "github.com/openshift/client-go/config/informers/externalversions" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/openshift/cluster-version-operator/pkg/cvo" + "github.com/openshift/cluster-version-operator/pkg/version" ) func TestTechPreviewChangeStopper(t *testing.T) { tests := []struct { name string startingRequiredFeatureSet string - featureGate string - expectedShutdownCalled bool + startingCvoFeatureGates cvo.FeatureGates + + featureSet string + featureGateStatus *configv1.FeatureGateStatus + + expectedShutdownCalled bool }{ { name: "default-no-change", startingRequiredFeatureSet: "", - featureGate: "", + featureSet: "", expectedShutdownCalled: false, }, { name: "default-with-change-to-tech-preview", startingRequiredFeatureSet: "", - featureGate: "TechPreviewNoUpgrade", + featureSet: "TechPreviewNoUpgrade", expectedShutdownCalled: true, }, { name: "default-with-change-to-other", startingRequiredFeatureSet: "", - featureGate: "AnythingElse", + featureSet: "AnythingElse", expectedShutdownCalled: true, }, { name: "techpreview-to-techpreview", startingRequiredFeatureSet: "TechPreviewNoUpgrade", - featureGate: "TechPreviewNoUpgrade", + featureSet: "TechPreviewNoUpgrade", expectedShutdownCalled: false, }, { name: "techpreview-to-not-tech-preview", // this isn't allowed today startingRequiredFeatureSet: "TechPreviewNoUpgrade", - featureGate: "", + featureSet: "", expectedShutdownCalled: true, }, + { + name: "cvo flags changed", + startingRequiredFeatureSet: "TechPreviewNoUpgrade", + startingCvoFeatureGates: cvo.FeatureGates{ + UnknownVersion: true, + }, + featureSet: "TechPreviewNoUpgrade", + featureGateStatus: &configv1.FeatureGateStatus{ + FeatureGates: []configv1.FeatureGateDetails{ + { + Version: version.Version.String(), + Enabled: []configv1.FeatureGateAttributes{{Name: configv1.FeatureGateUpgradeStatus}}, + }, + }, + }, + expectedShutdownCalled: true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -59,20 +83,26 @@ func TestTechPreviewChangeStopper(t *testing.T) { actualShutdownCalled = true } - client := fakeconfigv1client.NewSimpleClientset( - &configv1.FeatureGate{ - ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, - Spec: configv1.FeatureGateSpec{ - FeatureGateSelection: configv1.FeatureGateSelection{ - FeatureSet: configv1.FeatureSet(tt.featureGate), - }, + fg := &configv1.FeatureGate{ + ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, + Spec: configv1.FeatureGateSpec{ + FeatureGateSelection: configv1.FeatureGateSelection{ + FeatureSet: configv1.FeatureSet(tt.featureSet), }, }, - ) + } + if tt.featureGateStatus != nil { + fg.Status = *tt.featureGateStatus + } else { + fg.Status = configv1.FeatureGateStatus{} + tt.startingCvoFeatureGates = cvo.FeatureGates{UnknownVersion: true} + } + + client := fakeconfigv1client.NewSimpleClientset(fg) informerFactory := configv1informer.NewSharedInformerFactory(client, 0) featureGates := informerFactory.Config().V1().FeatureGates() - c, err := New(tt.startingRequiredFeatureSet, featureGates) + c, err := New(tt.startingRequiredFeatureSet, tt.startingCvoFeatureGates, featureGates) if err != nil { t.Fatal(err) } diff --git a/pkg/start/start.go b/pkg/start/start.go index 46cf79140d..f0eba2b86b 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -13,6 +13,7 @@ import ( "time" "github.com/google/uuid" + v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -32,7 +33,7 @@ import ( configv1 "github.com/openshift/api/config/v1" clientset "github.com/openshift/client-go/config/clientset/versioned" - externalversions "github.com/openshift/client-go/config/informers/externalversions" + "github.com/openshift/client-go/config/informers/externalversions" "github.com/openshift/cluster-version-operator/pkg/autoupdate" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" "github.com/openshift/cluster-version-operator/pkg/cvo" @@ -159,6 +160,9 @@ func (o *Options) Run(ctx context.Context) error { // to be off. If this value changes, the binary will shutdown and expect the pod lifecycle to restart it. startingFeatureSet := "" + // enabledGates control gated CVO functionality (not gated cluster functionality) + enabledGates := cvo.DefaultGatesWhenUnknown() + // client-go automatically retries some network blip errors on GETs for 30s by default, and we want to // retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work // anyway. Return the error and crashloop. @@ -180,6 +184,7 @@ func (o *Options) Run(ctx context.Context) error { return false, nil default: startingFeatureSet = string(gate.Spec.FeatureSet) + enabledGates = cvo.GetCvoGatesFrom(gate) return true, nil } }); err != nil { @@ -189,13 +194,19 @@ func (o *Options) Run(ctx context.Context) error { return err } + klog.Infof("Feature set detected at startup: %q", startingFeatureSet) + if enabledGates.UnknownVersion { + klog.Infof("CVO features could not be detected from FeatureGate; will use defaults plus special UnkownVersion feature gate") + } + klog.Infof("CVO features enabled at startup: %+v", enabledGates) + lock, err := createResourceLock(cb, o.Namespace, o.Name) if err != nil { return err } // initialize the controllers and attempt to load the payload information - controllerCtx, err := o.NewControllerContext(cb, startingFeatureSet) + controllerCtx, err := o.NewControllerContext(cb, startingFeatureSet, enabledGates) if err != nil { return err } @@ -471,7 +482,7 @@ type Context struct { // NewControllerContext initializes the default Context for the current Options. It does // not start any background processes. -func (o *Options) NewControllerContext(cb *ClientBuilder, startingFeatureSet string) (*Context, error) { +func (o *Options) NewControllerContext(cb *ClientBuilder, startingFeatureSet string, enabledFeatureGates cvo.FeatureGates) (*Context, error) { client := cb.ClientOrDie("shared-informer") kubeClient := cb.KubeClientOrDie(internal.ConfigNamespace, useProtobuf) @@ -484,7 +495,7 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, startingFeatureSet str sharedInformers := externalversions.NewSharedInformerFactory(client, resyncPeriod(o.ResyncInterval)) coInformer := sharedInformers.Config().V1().ClusterOperators() - featureChangeStopper, err := featurechangestopper.New(startingFeatureSet, sharedInformers.Config().V1().FeatureGates()) + featureChangeStopper, err := featurechangestopper.New(startingFeatureSet, enabledFeatureGates, sharedInformers.Config().V1().FeatureGates()) if err != nil { return nil, err } @@ -506,6 +517,7 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, startingFeatureSet str cvoKubeClient, o.Exclude, startingFeatureSet, + enabledFeatureGates, o.ClusterProfile, o.PromQLTarget, o.InjectClusterIdIntoPromQL, diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index 9b224df44b..30a71e3813 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -185,7 +185,8 @@ func TestIntegrationCVO_initializeAndUpgrade(t *testing.T) { options.PayloadOverride = filepath.Join(dir, "0.0.1") options.leaderElection = getLeaderElectionConfig(ctx, cfg) startingFeatureSet := "" - controllers, err := options.NewControllerContext(cb, startingFeatureSet) + enabledGates := cvo.FeatureGates{} + controllers, err := options.NewControllerContext(cb, startingFeatureSet, enabledGates) if err != nil { t.Fatal(err) } @@ -317,7 +318,8 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { options.PayloadOverride = filepath.Join(dir, "0.0.1") options.leaderElection = getLeaderElectionConfig(ctx, cfg) startingFeatureSet := "" - controllers, err := options.NewControllerContext(cb, startingFeatureSet) + enabledGates := cvo.FeatureGates{} + controllers, err := options.NewControllerContext(cb, startingFeatureSet, enabledGates) if err != nil { t.Fatal(err) } @@ -511,7 +513,8 @@ metadata: options.PayloadOverride = payloadDir options.leaderElection = getLeaderElectionConfig(ctx, cfg) startingFeatureSet := "" - controllers, err := options.NewControllerContext(cb, startingFeatureSet) + enabledGates := cvo.FeatureGates{} + controllers, err := options.NewControllerContext(cb, startingFeatureSet, enabledGates) if err != nil { t.Fatal(err) } From 922397e45687012c7bcdd872471173db3659ebec Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Tue, 5 Mar 2024 20:27:47 +0100 Subject: [PATCH 2/5] featuregates: move gates-related code into a package --- pkg/cvo/cvo.go | 76 ++--------------- .../featurechangestopper.go | 51 +++++------ .../featurechangestopper_test.go | 21 +++-- pkg/featuregates/featuregates.go | 85 +++++++++++++++++++ pkg/start/start.go | 40 +++++---- pkg/start/start_integration_test.go | 23 +++-- 6 files changed, 156 insertions(+), 140 deletions(-) rename pkg/{featurechangestopper => featuregates}/featurechangestopper.go (74%) rename pkg/{featurechangestopper => featuregates}/featurechangestopper_test.go (87%) create mode 100644 pkg/featuregates/featuregates.go diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 51f13f2f70..a818bca4c4 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "net/http" - "os" "strconv" "sync" "time" @@ -46,6 +45,7 @@ import ( "github.com/openshift/cluster-version-operator/pkg/customsignaturestore" cvointernal "github.com/openshift/cluster-version-operator/pkg/cvo/internal" "github.com/openshift/cluster-version-operator/pkg/cvo/internal/dynamicclient" + "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/internal" "github.com/openshift/cluster-version-operator/pkg/payload" "github.com/openshift/cluster-version-operator/pkg/payload/precondition" @@ -57,61 +57,6 @@ const ( maxRetries = 15 ) -// FeatureGates contains flags that control CVO functionality gated by product feature gates. The -// names do not correspond to product feature gates, the booleans here are "smaller" (product-level -// gate will enable multiple CVO behaviors). -type FeatureGates struct { - // UnknownVersion flag is set to true if CVO did not find a matching version in the FeatureGate - // status resource, meaning the current set of enabled and disabled feature gates is unknown for - // this version. This should be a temporary state (config-operator should eventually add the - // enabled/disabled flags for this version), so CVO should try to behave in a way that reflects - // a "good default": default-on flags are enabled, default-off flags are disabled. Where reasonable, - // It can also attempt to tolerate the existing state: if it finds evidence that a feature was - // enabled, it can continue to behave as if it was enabled and vice versa. This temporary state - // should be eventually resolved when the FeatureGate status resource is updated, which forces CVO - // to restart when the flags change. - UnknownVersion bool - - // ResourceReconciliationIssuesCondition controls whether CVO maintains a Condition with - // ResourceReconciliationIssues type, containing a JSON that describes all "issues" that prevented - // or delayed CVO from reconciling individual resources in the cluster. This is a pseudo-API - // that the experimental work for "oc adm upgrade status" uses to report upgrade status, and - // should never be relied upon by any production code. We may want to eventually turn this into - // some kind of "real" API. - ResourceReconciliationIssuesCondition bool -} - -func DefaultGatesWhenUnknown() FeatureGates { - return FeatureGates{ - UnknownVersion: true, - - ResourceReconciliationIssuesCondition: false, - } -} - -func GetCvoGatesFrom(gate *configv1.FeatureGate) FeatureGates { - enabledGates := DefaultGatesWhenUnknown() - // This is analogical to VersionForOperatorFromEnv() from o/library-go but the import is pretty - // heavy for a single, simple os.Getenv wrapper, so we just inline the logic here. - operatorVersion := os.Getenv("OPERATOR_IMAGE_VERSION") - klog.Infof("Looking up feature gates for version %s", operatorVersion) - for _, g := range gate.Status.FeatureGates { - - if g.Version != operatorVersion { - continue - } - // We found the matching version, so we do not need to run in the unknown version mode - enabledGates.UnknownVersion = false - for _, enabled := range g.Enabled { - if enabled.Name == configv1.FeatureGateUpgradeStatus { - enabledGates.ResourceReconciliationIssuesCondition = true - } - } - } - - return enabledGates -} - // Operator defines cluster version operator. The CVO attempts to reconcile the appropriate image // onto the cluster, writing status to the ClusterVersion object as it goes. A background loop // periodically checks for new updates from a server described by spec.upstream and spec.channel. @@ -220,14 +165,7 @@ type Operator struct { // via annotation exclude string - // requiredFeatureSet is set the value of featuregates.config.openshift.io|.spec.featureSet. It's a very slow - // moving resource, so it is not re-detected live. - requiredFeatureSet string - - // enabledFeatureGates contains flags that control CVO functionality gated by product feature gates. The - // names do not correspond to product feature gates, the booleans here are "smaller" (product-level - // gate will enable multiple CVO behaviors). - enabledFeatureGates FeatureGates + clusterFeatures featuregates.ClusterFeatures clusterProfile string uid types.UID @@ -248,8 +186,7 @@ func New( client clientset.Interface, kubeClient kubernetes.Interface, exclude string, - requiredFeatureSet string, - enabledFeatureGates FeatureGates, + clusterFeatures featuregates.ClusterFeatures, clusterProfile string, promqlTarget clusterconditions.PromQLTarget, injectClusterIdIntoPromQL bool, @@ -281,8 +218,7 @@ func New( upgradeableQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "upgradeable"), exclude: exclude, - requiredFeatureSet: requiredFeatureSet, - enabledFeatureGates: enabledFeatureGates, + clusterFeatures: clusterFeatures, clusterProfile: clusterProfile, conditionRegistry: standard.NewConditionRegistry(promqlTarget), injectClusterIdIntoPromQL: injectClusterIdIntoPromQL, @@ -340,7 +276,7 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res return fmt.Errorf("Error when attempting to get cluster version object: %w", err) } - update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, optr.requiredFeatureSet, + update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, optr.clusterFeatures.StartingRequiredFeatureSet, optr.clusterProfile, capability.GetKnownCapabilities()) if err != nil { @@ -391,7 +327,7 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res Cap: time.Second * 15, }, optr.exclude, - optr.requiredFeatureSet, + optr.clusterFeatures.StartingRequiredFeatureSet, optr.eventRecorder, optr.clusterProfile, ) diff --git a/pkg/featurechangestopper/featurechangestopper.go b/pkg/featuregates/featurechangestopper.go similarity index 74% rename from pkg/featurechangestopper/featurechangestopper.go rename to pkg/featuregates/featurechangestopper.go index 0466c24b45..8c7126cc79 100644 --- a/pkg/featurechangestopper/featurechangestopper.go +++ b/pkg/featuregates/featurechangestopper.go @@ -1,4 +1,4 @@ -package featurechangestopper +package featuregates import ( "context" @@ -15,14 +15,11 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" - - "github.com/openshift/cluster-version-operator/pkg/cvo" ) -// FeatureChangeStopper calls stop when the value of the featureset changes -type FeatureChangeStopper struct { - startingRequiredFeatureSet string - startingCvoFeatureGates cvo.FeatureGates +// ChangeStopper calls stop when the value of the featureset changes +type ChangeStopper struct { + clusterFeatures ClusterFeatures featureGateLister configlistersv1.FeatureGateLister cacheSynced []cache.InformerSynced @@ -31,18 +28,16 @@ type FeatureChangeStopper struct { shutdownFn context.CancelFunc } -// New returns a new FeatureChangeStopper. +// New returns a new ChangeStopper. func New( - startingRequiredFeatureSet string, - startingCvoGates cvo.FeatureGates, + clusterFeatures ClusterFeatures, featureGateInformer configinformersv1.FeatureGateInformer, -) (*FeatureChangeStopper, error) { - c := &FeatureChangeStopper{ - startingRequiredFeatureSet: startingRequiredFeatureSet, - startingCvoFeatureGates: startingCvoGates, - featureGateLister: featureGateInformer.Lister(), - cacheSynced: []cache.InformerSynced{featureGateInformer.Informer().HasSynced}, - queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "feature-gate-stopper"), +) (*ChangeStopper, error) { + c := &ChangeStopper{ + clusterFeatures: clusterFeatures, + featureGateLister: featureGateInformer.Lister(), + cacheSynced: []cache.InformerSynced{featureGateInformer.Informer().HasSynced}, + queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "feature-gate-stopper"), } c.queue.Add("cluster") // seed an initial sync, in case startingRequiredFeatureSet is wrong @@ -66,18 +61,18 @@ func New( // syncHandler processes a single work entry, with the // processNextWorkItem caller handling the queue management. It returns // done when there will be no more work (because the feature gate changed). -func (c *FeatureChangeStopper) syncHandler(_ context.Context) (done bool, err error) { +func (c *ChangeStopper) syncHandler(_ context.Context) (done bool, err error) { var current configv1.FeatureSet - var currentCvoGates cvo.FeatureGates + var currentCvoGates CvoGates if featureGates, err := c.featureGateLister.Get("cluster"); err == nil { current = featureGates.Spec.FeatureSet - currentCvoGates = cvo.GetCvoGatesFrom(featureGates) + currentCvoGates = getCvoGatesFrom(featureGates, c.clusterFeatures.VersionForGates) } else if !apierrors.IsNotFound(err) { return false, err } - featureSetChanged := string(current) != c.startingRequiredFeatureSet - cvoFeaturesChanged := currentCvoGates != c.startingCvoFeatureGates + featureSetChanged := string(current) != c.clusterFeatures.StartingRequiredFeatureSet + cvoFeaturesChanged := currentCvoGates != c.clusterFeatures.StartingCvoFeatureGates if featureSetChanged || cvoFeaturesChanged { var action string if c.shutdownFn == nil { @@ -86,10 +81,10 @@ func (c *FeatureChangeStopper) syncHandler(_ context.Context) (done bool, err er action = "requesting shutdown" } if featureSetChanged { - klog.Infof("FeatureSet was %q, but the current feature set is %q; %s.", c.startingRequiredFeatureSet, current, action) + klog.Infof("FeatureSet was %q, but the current feature set is %q; %s.", c.clusterFeatures.StartingRequiredFeatureSet, current, action) } if cvoFeaturesChanged { - klog.Infof("CVO feature flags were %v, but changed to %v; %s.", c.startingCvoFeatureGates, currentCvoGates, action) + klog.Infof("CVO feature flags were %v, but changed to %v; %s.", c.clusterFeatures.StartingCvoFeatureGates, currentCvoGates, action) } if c.shutdownFn != nil { @@ -101,7 +96,7 @@ func (c *FeatureChangeStopper) syncHandler(_ context.Context) (done bool, err er } // Run launches the controller and blocks until it is canceled or work completes. -func (c *FeatureChangeStopper) Run(ctx context.Context, shutdownFn context.CancelFunc) error { +func (c *ChangeStopper) Run(ctx context.Context, shutdownFn context.CancelFunc) error { // don't let panics crash the process defer utilruntime.HandleCrash() // make sure the work queue is shutdown which will trigger workers to end @@ -114,7 +109,7 @@ func (c *FeatureChangeStopper) Run(ctx context.Context, shutdownFn context.Cance }() c.shutdownFn = shutdownFn - klog.Infof("Starting stop-on-featureset-change controller with %q.", c.startingRequiredFeatureSet) + klog.Infof("Starting stop-on-featureset-change controller with %q.", c.clusterFeatures.StartingRequiredFeatureSet) // wait for your secondary caches to fill before starting your work if !cache.WaitForCacheSync(ctx.Done(), c.cacheSynced...) { @@ -129,7 +124,7 @@ func (c *FeatureChangeStopper) Run(ctx context.Context, shutdownFn context.Cance // runWorker handles a single worker poll round, processing as many // work items as possible, and returning done when there will be no // more work. -func (c *FeatureChangeStopper) runWorker(ctx context.Context) (done bool, err error) { +func (c *ChangeStopper) runWorker(ctx context.Context) (done bool, err error) { // hot loop until we're told to stop. processNextWorkItem will // automatically wait until there's work available, so we don't worry // about secondary waits @@ -142,7 +137,7 @@ func (c *FeatureChangeStopper) runWorker(ctx context.Context) (done bool, err er // processNextWorkItem deals with one key off the queue. It returns // done when there will be no more work. -func (c *FeatureChangeStopper) processNextWorkItem(ctx context.Context) (done bool, err error) { +func (c *ChangeStopper) processNextWorkItem(ctx context.Context) (done bool, err error) { // pull the next work item from queue. It should be a key we use to lookup // something in a cache key, quit := c.queue.Get() diff --git a/pkg/featurechangestopper/featurechangestopper_test.go b/pkg/featuregates/featurechangestopper_test.go similarity index 87% rename from pkg/featurechangestopper/featurechangestopper_test.go rename to pkg/featuregates/featurechangestopper_test.go index 346547a64c..d606625671 100644 --- a/pkg/featurechangestopper/featurechangestopper_test.go +++ b/pkg/featuregates/featurechangestopper_test.go @@ -1,4 +1,4 @@ -package featurechangestopper +package featuregates import ( "context" @@ -9,16 +9,14 @@ import ( fakeconfigv1client "github.com/openshift/client-go/config/clientset/versioned/fake" configv1informer "github.com/openshift/client-go/config/informers/externalversions" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/openshift/cluster-version-operator/pkg/cvo" - "github.com/openshift/cluster-version-operator/pkg/version" ) func TestTechPreviewChangeStopper(t *testing.T) { + versionForGates := "1.2.3" tests := []struct { name string startingRequiredFeatureSet string - startingCvoFeatureGates cvo.FeatureGates + startingCvoFeatureGates CvoGates featureSet string featureGateStatus *configv1.FeatureGateStatus @@ -58,14 +56,14 @@ func TestTechPreviewChangeStopper(t *testing.T) { { name: "cvo flags changed", startingRequiredFeatureSet: "TechPreviewNoUpgrade", - startingCvoFeatureGates: cvo.FeatureGates{ + startingCvoFeatureGates: CvoGates{ UnknownVersion: true, }, featureSet: "TechPreviewNoUpgrade", featureGateStatus: &configv1.FeatureGateStatus{ FeatureGates: []configv1.FeatureGateDetails{ { - Version: version.Version.String(), + Version: versionForGates, Enabled: []configv1.FeatureGateAttributes{{Name: configv1.FeatureGateUpgradeStatus}}, }, }, @@ -95,14 +93,19 @@ func TestTechPreviewChangeStopper(t *testing.T) { fg.Status = *tt.featureGateStatus } else { fg.Status = configv1.FeatureGateStatus{} - tt.startingCvoFeatureGates = cvo.FeatureGates{UnknownVersion: true} + tt.startingCvoFeatureGates = CvoGates{UnknownVersion: true} } client := fakeconfigv1client.NewSimpleClientset(fg) informerFactory := configv1informer.NewSharedInformerFactory(client, 0) featureGates := informerFactory.Config().V1().FeatureGates() - c, err := New(tt.startingRequiredFeatureSet, tt.startingCvoFeatureGates, featureGates) + cf := ClusterFeatures{ + StartingRequiredFeatureSet: tt.startingRequiredFeatureSet, + StartingCvoFeatureGates: tt.startingCvoFeatureGates, + VersionForGates: versionForGates, + } + c, err := New(cf, featureGates) if err != nil { t.Fatal(err) } diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go new file mode 100644 index 0000000000..efbfb6b7ce --- /dev/null +++ b/pkg/featuregates/featuregates.go @@ -0,0 +1,85 @@ +package featuregates + +import ( + configv1 "github.com/openshift/api/config/v1" + "k8s.io/klog/v2" +) + +// CvoGates contains flags that control CVO functionality gated by product feature gates. The +// names do not correspond to product feature gates, the booleans here are "smaller" (product-level +// gate will enable multiple CVO behaviors). +type CvoGates struct { + + // UnknownVersion flag is set to true if CVO did not find a matching version in the FeatureGate + // status resource, meaning the current set of enabled and disabled feature gates is unknown for + // this version. This should be a temporary state (config-operator should eventually add the + // enabled/disabled flags for this version), so CVO should try to behave in a way that reflects + // a "good default": default-on flags are enabled, default-off flags are disabled. Where reasonable, + // It can also attempt to tolerate the existing state: if it finds evidence that a feature was + // enabled, it can continue to behave as if it was enabled and vice versa. This temporary state + // should be eventually resolved when the FeatureGate status resource is updated, which forces CVO + // to restart when the flags change. + UnknownVersion bool + + // ResourceReconciliationIssuesCondition controls whether CVO maintains a Condition with + // ResourceReconciliationIssues type, containing a JSON that describes all "issues" that prevented + // or delayed CVO from reconciling individual resources in the cluster. This is a pseudo-API + // that the experimental work for "oc adm upgrade status" uses to report upgrade status, and + // should never be relied upon by any production code. We may want to eventually turn this into + // some kind of "real" API. + ResourceReconciliationIssuesCondition bool +} + +type ClusterFeatures struct { + StartingRequiredFeatureSet string + + // StartingCvoFeatureGates control gated CVO functionality (not gated cluster functionality) + StartingCvoFeatureGates CvoGates + VersionForGates string +} + +func DefaultClusterFeatures(version string) ClusterFeatures { + return ClusterFeatures{ + // check to see if techpreview should be on or off. If we cannot read the featuregate for any reason, it is assumed + // to be off. If this value changes, the binary will shutdown and expect the pod lifecycle to restart it. + StartingRequiredFeatureSet: "", + StartingCvoFeatureGates: defaultGatesWhenUnknown(), + VersionForGates: version, + } +} + +func ClusterFeaturesFromFeatureGate(gate *configv1.FeatureGate, version string) ClusterFeatures { + return ClusterFeatures{ + StartingRequiredFeatureSet: string(gate.Spec.FeatureSet), + StartingCvoFeatureGates: getCvoGatesFrom(gate, version), + VersionForGates: version, + } +} + +func defaultGatesWhenUnknown() CvoGates { + return CvoGates{ + UnknownVersion: true, + + ResourceReconciliationIssuesCondition: false, + } +} + +func getCvoGatesFrom(gate *configv1.FeatureGate, version string) CvoGates { + enabledGates := defaultGatesWhenUnknown() + klog.Infof("Looking up feature gates for version %s", version) + for _, g := range gate.Status.FeatureGates { + + if g.Version != version { + continue + } + // We found the matching version, so we do not need to run in the unknown version mode + enabledGates.UnknownVersion = false + for _, enabled := range g.Enabled { + if enabled.Name == configv1.FeatureGateUpgradeStatus { + enabledGates.ResourceReconciliationIssuesCondition = true + } + } + } + + return enabledGates +} diff --git a/pkg/start/start.go b/pkg/start/start.go index f0eba2b86b..62397b8e7b 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -34,14 +34,15 @@ import ( configv1 "github.com/openshift/api/config/v1" clientset "github.com/openshift/client-go/config/clientset/versioned" "github.com/openshift/client-go/config/informers/externalversions" + "github.com/openshift/library-go/pkg/config/clusterstatus" + libgoleaderelection "github.com/openshift/library-go/pkg/config/leaderelection" + "github.com/openshift/cluster-version-operator/pkg/autoupdate" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" "github.com/openshift/cluster-version-operator/pkg/cvo" - "github.com/openshift/cluster-version-operator/pkg/featurechangestopper" + "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/internal" "github.com/openshift/cluster-version-operator/pkg/payload" - "github.com/openshift/library-go/pkg/config/clusterstatus" - libgoleaderelection "github.com/openshift/library-go/pkg/config/leaderelection" ) const ( @@ -156,12 +157,11 @@ func (o *Options) Run(ctx context.Context) error { return fmt.Errorf("error creating clients: %v", err) } - // check to see if techpreview should be on or off. If we cannot read the featuregate for any reason, it is assumed - // to be off. If this value changes, the binary will shutdown and expect the pod lifecycle to restart it. - startingFeatureSet := "" + var clusterFeatures featuregates.ClusterFeatures - // enabledGates control gated CVO functionality (not gated cluster functionality) - enabledGates := cvo.DefaultGatesWhenUnknown() + // This is analogical to VersionForOperatorFromEnv() from o/library-go but the import is pretty + // heavy for a single, simple os.Getenv wrapper, so we just inline the logic here. + operatorVersion := os.Getenv("OPERATOR_IMAGE_VERSION") // client-go automatically retries some network blip errors on GETs for 30s by default, and we want to // retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work @@ -176,15 +176,14 @@ func (o *Options) Run(ctx context.Context) error { case apierrors.IsNotFound(fgErr): // if we have no featuregates, then the cluster is using the default featureset, which is "". // This excludes everything that could possibly depend on a different feature set. - startingFeatureSet = "" + clusterFeatures = featuregates.DefaultClusterFeatures(operatorVersion) return true, nil case fgErr != nil: lastError = fgErr klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr) return false, nil default: - startingFeatureSet = string(gate.Spec.FeatureSet) - enabledGates = cvo.GetCvoGatesFrom(gate) + clusterFeatures = featuregates.ClusterFeaturesFromFeatureGate(gate, operatorVersion) return true, nil } }); err != nil { @@ -194,11 +193,11 @@ func (o *Options) Run(ctx context.Context) error { return err } - klog.Infof("Feature set detected at startup: %q", startingFeatureSet) - if enabledGates.UnknownVersion { - klog.Infof("CVO features could not be detected from FeatureGate; will use defaults plus special UnkownVersion feature gate") + klog.Infof("Feature set detected at startup: %q", clusterFeatures.StartingRequiredFeatureSet) + if clusterFeatures.StartingCvoFeatureGates.UnknownVersion { + klog.Infof("CVO features for version %s could not be detected from FeatureGate; will use defaults plus special UnkownVersion feature gate", clusterFeatures.VersionForGates) } - klog.Infof("CVO features enabled at startup: %+v", enabledGates) + klog.Infof("CVO features for version %s enabled at startup: %+v", clusterFeatures.VersionForGates, clusterFeatures.StartingCvoFeatureGates) lock, err := createResourceLock(cb, o.Namespace, o.Name) if err != nil { @@ -206,7 +205,7 @@ func (o *Options) Run(ctx context.Context) error { } // initialize the controllers and attempt to load the payload information - controllerCtx, err := o.NewControllerContext(cb, startingFeatureSet, enabledGates) + controllerCtx, err := o.NewControllerContext(cb, clusterFeatures) if err != nil { return err } @@ -472,7 +471,7 @@ func getLeaderElectionConfig(ctx context.Context, restcfg *rest.Config) configv1 type Context struct { CVO *cvo.Operator AutoUpdate *autoupdate.Controller - StopOnFeatureGateChange *featurechangestopper.FeatureChangeStopper + StopOnFeatureGateChange *featuregates.ChangeStopper CVInformerFactory externalversions.SharedInformerFactory OpenshiftConfigInformerFactory informers.SharedInformerFactory @@ -482,7 +481,7 @@ type Context struct { // NewControllerContext initializes the default Context for the current Options. It does // not start any background processes. -func (o *Options) NewControllerContext(cb *ClientBuilder, startingFeatureSet string, enabledFeatureGates cvo.FeatureGates) (*Context, error) { +func (o *Options) NewControllerContext(cb *ClientBuilder, clusterFeatures featuregates.ClusterFeatures) (*Context, error) { client := cb.ClientOrDie("shared-informer") kubeClient := cb.KubeClientOrDie(internal.ConfigNamespace, useProtobuf) @@ -495,7 +494,7 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, startingFeatureSet str sharedInformers := externalversions.NewSharedInformerFactory(client, resyncPeriod(o.ResyncInterval)) coInformer := sharedInformers.Config().V1().ClusterOperators() - featureChangeStopper, err := featurechangestopper.New(startingFeatureSet, enabledFeatureGates, sharedInformers.Config().V1().FeatureGates()) + featureChangeStopper, err := featuregates.New(clusterFeatures, sharedInformers.Config().V1().FeatureGates()) if err != nil { return nil, err } @@ -516,8 +515,7 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, startingFeatureSet str cb.ClientOrDie(o.Namespace), cvoKubeClient, o.Exclude, - startingFeatureSet, - enabledFeatureGates, + clusterFeatures, o.ClusterProfile, o.PromQLTarget, o.InjectClusterIdIntoPromQL, diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index 30a71e3813..8c40607baf 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -33,6 +33,7 @@ import ( "github.com/openshift/cluster-version-operator/lib/resourcemerge" "github.com/openshift/cluster-version-operator/pkg/cvo" + "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/payload" ) @@ -184,14 +185,13 @@ func TestIntegrationCVO_initializeAndUpgrade(t *testing.T) { options.ReleaseImage = payloadImage1 options.PayloadOverride = filepath.Join(dir, "0.0.1") options.leaderElection = getLeaderElectionConfig(ctx, cfg) - startingFeatureSet := "" - enabledGates := cvo.FeatureGates{} - controllers, err := options.NewControllerContext(cb, startingFeatureSet, enabledGates) + var clusterFeatures featuregates.ClusterFeatures + controllers, err := options.NewControllerContext(cb, clusterFeatures) if err != nil { t.Fatal(err) } - worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", startingFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) + worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", clusterFeatures.StartingRequiredFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) controllers.CVO.SetSyncWorkerForTesting(worker) lock, err := createResourceLock(cb, options.Namespace, options.Name) @@ -317,14 +317,13 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { options.ReleaseImage = payloadImage1 options.PayloadOverride = filepath.Join(dir, "0.0.1") options.leaderElection = getLeaderElectionConfig(ctx, cfg) - startingFeatureSet := "" - enabledGates := cvo.FeatureGates{} - controllers, err := options.NewControllerContext(cb, startingFeatureSet, enabledGates) + var clusterFeatures featuregates.ClusterFeatures + controllers, err := options.NewControllerContext(cb, clusterFeatures) if err != nil { t.Fatal(err) } - worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", startingFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) + worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", clusterFeatures.StartingRequiredFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) controllers.CVO.SetSyncWorkerForTesting(worker) lock, err := createResourceLock(cb, options.Namespace, options.Name) @@ -512,14 +511,14 @@ metadata: options.ReleaseImage = payloadImage1 options.PayloadOverride = payloadDir options.leaderElection = getLeaderElectionConfig(ctx, cfg) - startingFeatureSet := "" - enabledGates := cvo.FeatureGates{} - controllers, err := options.NewControllerContext(cb, startingFeatureSet, enabledGates) + + var clusterFeatures featuregates.ClusterFeatures + controllers, err := options.NewControllerContext(cb, clusterFeatures) if err != nil { t.Fatal(err) } - worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", startingFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) + worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", clusterFeatures.StartingRequiredFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) controllers.CVO.SetSyncWorkerForTesting(worker) arch := runtime.GOARCH From e7ea6db69ad5757708fed2b8b18706b81fabfa47 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Mon, 29 Jan 2024 17:54:58 +0100 Subject: [PATCH 3/5] cvo: set ResourceReconciliationIssues condition When an appropriate feature flag is set, maintain a `ResourceReconciliationIssues` condition on the CV status. This condition is False when no issues were encountered (signalled by the `Failure` field on the `SyncWorkerStatus` parameter) and True otherwise. --- pkg/cvo/resource_reconciliation_issues.go | 13 ++ pkg/cvo/status.go | 24 +++- pkg/cvo/status_test.go | 140 ++++++++++++++++++++++ 3 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 pkg/cvo/resource_reconciliation_issues.go diff --git a/pkg/cvo/resource_reconciliation_issues.go b/pkg/cvo/resource_reconciliation_issues.go new file mode 100644 index 0000000000..04a57753d7 --- /dev/null +++ b/pkg/cvo/resource_reconciliation_issues.go @@ -0,0 +1,13 @@ +package cvo + +import v1 "github.com/openshift/api/config/v1" + +const ( + resourceReconciliationIssuesConditionType v1.ClusterStatusConditionType = "ResourceReconciliationIssues" + + noResourceReconciliationIssuesReason string = "NoIssues" + noResourceReconciliationIssuesMessage string = "No issues found during resource reconciliation" + + resourceReconciliationIssuesFoundReason string = "IssuesFound" + resourceReconciliationIssuesFoundMessage string = "Issues found during resource reconciliation" +) diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 25d7142b6a..ba5917fb7e 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -23,6 +23,7 @@ import ( configclientv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" "github.com/openshift/cluster-version-operator/lib/resourcemerge" + "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/payload" ) @@ -198,7 +199,7 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1 original = config.DeepCopy() } - updateClusterVersionStatus(&config.Status, status, optr.release, optr.getAvailableUpdates, validationErrs) + updateClusterVersionStatus(&config.Status, status, optr.release, optr.getAvailableUpdates, optr.clusterFeatures.StartingCvoFeatureGates, validationErrs) if klog.V(6).Enabled() { klog.Infof("Apply config: %s", diff.ObjectReflectDiff(original, config)) @@ -210,7 +211,8 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1 // updateClusterVersionStatus updates the passed cvStatus with the latest status information func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status *SyncWorkerStatus, - release configv1.Release, getAvailableUpdates func() *availableUpdates, validationErrs field.ErrorList) { + release configv1.Release, getAvailableUpdates func() *availableUpdates, enabledGates featuregates.CvoGates, + validationErrs field.ErrorList) { cvStatus.ObservedGeneration = status.Generation if len(status.VersionHash) > 0 { @@ -379,6 +381,24 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status } } + oldRriCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType) + if enabledGates.ResourceReconciliationIssuesCondition || (oldRriCondition != nil && enabledGates.UnknownVersion) { + rriCondition := configv1.ClusterOperatorStatusCondition{ + Type: resourceReconciliationIssuesConditionType, + Status: configv1.ConditionFalse, + Reason: noResourceReconciliationIssuesReason, + Message: noResourceReconciliationIssuesMessage, + } + if status.Failure != nil { + rriCondition.Status = configv1.ConditionTrue + rriCondition.Reason = resourceReconciliationIssuesFoundReason + rriCondition.Message = fmt.Sprintf("%s: %s", resourceReconciliationIssuesFoundMessage, status.Failure.Error()) + } + resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, rriCondition) + } else if oldRriCondition != nil { + resourcemerge.RemoveOperatorStatusCondition(&cvStatus.Conditions, resourceReconciliationIssuesConditionType) + } + // default retrieved updates if it is not set if resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.RetrievedUpdates) == nil { resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, configv1.ClusterOperatorStatusCondition{ diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index f9985f34ad..b953d139b1 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -6,12 +6,18 @@ import ( "reflect" "testing" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/diff" + "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/client-go/tools/record" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/client-go/config/clientset/versioned/fake" + + "github.com/openshift/cluster-version-operator/lib/resourcemerge" + "github.com/openshift/cluster-version-operator/pkg/featuregates" ) func Test_mergeEqualVersions(t *testing.T) { @@ -190,3 +196,137 @@ func TestOperator_syncFailingStatus(t *testing.T) { }) } } + +func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) { + ignoreLastTransitionTime := cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime") + + testCases := []struct { + name string + + unknownVersion bool + oldCondition *configv1.ClusterOperatorStatusCondition + failure error + + expectedRriCondition *configv1.ClusterOperatorStatusCondition + }{ + { + name: "RRI disabled, version known, no failure => condition not present", + unknownVersion: false, + expectedRriCondition: nil, + }, + { + name: "RRI disabled, version known, failure => condition not present", + unknownVersion: false, + failure: fmt.Errorf("Something happened"), + expectedRriCondition: nil, + }, + { + name: "RRI disabled, version unknown, failure, existing condition => condition present", + oldCondition: &configv1.ClusterOperatorStatusCondition{ + Type: resourceReconciliationIssuesConditionType, + Status: configv1.ConditionFalse, + Reason: noResourceReconciliationIssuesReason, + Message: "Happy condition is happy", + }, + unknownVersion: true, + failure: fmt.Errorf("Something happened"), + expectedRriCondition: &configv1.ClusterOperatorStatusCondition{ + Type: resourceReconciliationIssuesConditionType, + Status: configv1.ConditionTrue, + Reason: resourceReconciliationIssuesFoundReason, + Message: "Issues found during resource reconciliation: Something happened", + }, + }, + { + name: "RRI disabled, version unknown, failure, no existing condition => condition not present", + unknownVersion: true, + failure: fmt.Errorf("Something happened"), + expectedRriCondition: nil, + }, + } + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + gates := featuregates.CvoGates{ + UnknownVersion: tc.unknownVersion, + ResourceReconciliationIssuesCondition: false, + } + release := configv1.Release{} + getAvailableUpdates := func() *availableUpdates { return nil } + var noErrors field.ErrorList + cvStatus := configv1.ClusterVersionStatus{} + if tc.oldCondition != nil { + cvStatus.Conditions = append(cvStatus.Conditions, *tc.oldCondition) + } + updateClusterVersionStatus(&cvStatus, &SyncWorkerStatus{Failure: tc.failure}, release, getAvailableUpdates, gates, noErrors) + condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType) + if diff := cmp.Diff(tc.expectedRriCondition, condition, ignoreLastTransitionTime); diff != "" { + t.Errorf("unexpected condition\n:%s", diff) + } + }) + + } + +} + +func TestUpdateClusterVersionStatus_ResourceReconciliationIssues(t *testing.T) { + ignoreLastTransitionTime := cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime") + + testCases := []struct { + name string + syncWorkerStatus SyncWorkerStatus + + enabled bool + + expectedCondition *configv1.ClusterOperatorStatusCondition + }{ + { + name: "ResourceReconciliationIssues present and happy when gate is enabled and no failures happened", + syncWorkerStatus: SyncWorkerStatus{}, + enabled: true, + expectedCondition: &configv1.ClusterOperatorStatusCondition{ + Type: resourceReconciliationIssuesConditionType, + Status: configv1.ConditionFalse, + Reason: noResourceReconciliationIssuesReason, + Message: noResourceReconciliationIssuesMessage, + }, + }, + { + name: "ResourceReconciliationIssues present and unhappy when gate is enabled and failures happened", + syncWorkerStatus: SyncWorkerStatus{ + Failure: fmt.Errorf("Something happened"), + }, + enabled: true, + expectedCondition: &configv1.ClusterOperatorStatusCondition{ + Type: resourceReconciliationIssuesConditionType, + Status: configv1.ConditionTrue, + Reason: resourceReconciliationIssuesFoundReason, + Message: "Issues found during resource reconciliation: Something happened", + }, + }, + { + name: "ResourceReconciliationIssues not present when gate is enabled and failures happened", + syncWorkerStatus: SyncWorkerStatus{ + Failure: fmt.Errorf("Something happened"), + }, + enabled: false, + expectedCondition: nil, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + gates := featuregates.CvoGates{ResourceReconciliationIssuesCondition: tc.enabled} + release := configv1.Release{} + getAvailableUpdates := func() *availableUpdates { return nil } + var noErrors field.ErrorList + cvStatus := configv1.ClusterVersionStatus{} + updateClusterVersionStatus(&cvStatus, &tc.syncWorkerStatus, release, getAvailableUpdates, gates, noErrors) + condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType) + if diff := cmp.Diff(tc.expectedCondition, condition, ignoreLastTransitionTime); diff != "" { + t.Errorf("unexpected condition\n:%s", diff) + } + }) + } +} From 7bd309612f3f741a2479c7960299344de9f162bc Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Wed, 28 Feb 2024 17:40:47 +0100 Subject: [PATCH 4/5] featuregates: process gates after version is read from payload Because of [OCPBUGS-30080](https://issues.redhat.com/browse/OCPBUGS-30080), we cannot easily determine running CVO version by a single `os.Getenv()`, like other operators can. CVO can determine its version from the initial payload it loads from disk though, but this happens a bit later in the code flow, after leadership lease is acquired and all informers are started. At that point we can provide the feature gate / featureset knowledge to the structures that need it: actual CVO controller and the feature changestopper, but these structures also need to be initialized earlier (they require informers which are already started). This leads to a slightly awkard delayed initialization scheme, where the controller structures are initialized early and populated with early content like informers etc. Later, when informers are started and CVO loads its initial payload, we can extract the version from it and use it to populate the feature gate in the controller structures. Because enabled feature gates are avaiable later in the flow, it also means part of the CVO code cannot be gated by a feature gate (like controller initialization, or initial payload loading). We do not need that now but it may cause issues later. The high-level sequence after this commit looks like this: 1. Initialize CVO and ChangeStopper controller structures with informers they need, and populate CVO's `enabledFeatureGate` checker with one panics when used (no code can check for gates before we know them) 2. Acquire lease and start the informers 3. Fetch a FeatureGate resource from the cluster (using an informer) and determine the FeatureSet from it (needed to load the payload) 4. Load the initial payload from disk and extract the version from it 5. Use the version to determine the enabled feature gates from the FeatureGate resource 6. Populate the CVO and ChangeStopper controller structures with the newly discovered feature gates --- ...luster-version-operator_03_deployment.yaml | 1 + pkg/cvo/cvo.go | 45 +++--- pkg/cvo/cvo_scenarios_test.go | 20 +-- pkg/cvo/cvo_test.go | 6 +- pkg/cvo/status.go | 6 +- pkg/cvo/status_test.go | 25 +++- pkg/cvo/sync_worker.go | 8 +- pkg/featuregates/featurechangestopper.go | 36 +++-- pkg/featuregates/featurechangestopper_test.go | 16 +-- pkg/featuregates/featuregates.go | 91 ++++++++----- pkg/start/start.go | 128 +++++++++++------- pkg/start/start_integration_test.go | 16 +-- 12 files changed, 233 insertions(+), 165 deletions(-) diff --git a/install/0000_00_cluster-version-operator_03_deployment.yaml b/install/0000_00_cluster-version-operator_03_deployment.yaml index 8950390a26..ebbdb31f17 100644 --- a/install/0000_00_cluster-version-operator_03_deployment.yaml +++ b/install/0000_00_cluster-version-operator_03_deployment.yaml @@ -56,6 +56,7 @@ spec: name: kube-api-access readOnly: true env: + # Unfortunately the placeholder is not replaced, reported as OCPBUGS-30080 - name: OPERATOR_IMAGE_VERSION value: "0.0.1-snapshot" - name: KUBERNETES_SERVICE_PORT # allows CVO to communicate with apiserver directly on same host. Is substituted with port from infrastructures.status.apiServerInternalURL if available. diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index a818bca4c4..ee8d49c82a 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -165,7 +165,7 @@ type Operator struct { // via annotation exclude string - clusterFeatures featuregates.ClusterFeatures + enabledFeatureGates featuregates.CvoGateChecker clusterProfile string uid types.UID @@ -186,7 +186,6 @@ func New( client clientset.Interface, kubeClient kubernetes.Interface, exclude string, - clusterFeatures featuregates.ClusterFeatures, clusterProfile string, promqlTarget clusterconditions.PromQLTarget, injectClusterIdIntoPromQL bool, @@ -218,10 +217,14 @@ func New( upgradeableQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "upgradeable"), exclude: exclude, - clusterFeatures: clusterFeatures, clusterProfile: clusterProfile, conditionRegistry: standard.NewConditionRegistry(promqlTarget), injectClusterIdIntoPromQL: injectClusterIdIntoPromQL, + + // Because of OCPBUGS-30080, we can only detect the enabled feature gates after Operator loads the initial payload + // from disk via LoadInitialPayload. We must not have any gate-checking code until that happens, so we initialize + // this field with a checker that panics when used. + enabledFeatureGates: featuregates.PanicOnUsageBeforeInitialization, } if _, err := cvInformer.Informer().AddEventHandler(optr.clusterVersionEventHandler()); err != nil { @@ -253,10 +256,9 @@ func New( return optr, nil } -// InitializeFromPayload waits until a ClusterVersion object exists. It then retrieves the payload contents and verifies the -// initial state, then configures the controller that loads and applies content to the cluster. It returns an error if the -// payload appears to be in error rather than continuing. -func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *rest.Config, burstRestConfig *rest.Config) error { +// LoadInitialPayload waits until a ClusterVersion object exists. It then retrieves the payload contents, verifies the +// initial state and returns it. If the payload is invalid, an error is returned. +func (optr *Operator) LoadInitialPayload(ctx context.Context, startingRequiredFeatureSet configv1.FeatureSet, restConfig *rest.Config) (*payload.Update, error) { // wait until cluster version object exists if err := wait.PollUntilContextCancel(ctx, 3*time.Second, true, func(ctx context.Context) (bool, error) { @@ -273,24 +275,19 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res } return true, nil }); err != nil { - return fmt.Errorf("Error when attempting to get cluster version object: %w", err) + return nil, fmt.Errorf("Error when attempting to get cluster version object: %w", err) } - update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, optr.clusterFeatures.StartingRequiredFeatureSet, + update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, string(startingRequiredFeatureSet), optr.clusterProfile, capability.GetKnownCapabilities()) if err != nil { - return fmt.Errorf("the local release contents are invalid - no current version can be determined from disk: %v", err) + return nil, fmt.Errorf("the local release contents are invalid - no current version can be determined from disk: %v", err) } - - optr.release = update.Release - optr.releaseCreated = update.ImageRef.CreationTimestamp.Time - optr.SetArchitecture(update.Architecture) - httpClientConstructor := sigstore.NewCachedHTTPClientConstructor(optr.HTTPClient, nil) configClient, err := coreclientsetv1.NewForConfig(restConfig) if err != nil { - return fmt.Errorf("unable to create a configuration client: %v", err) + return nil, fmt.Errorf("unable to create a configuration client: %v", err) } customSignatureStore := &customsignaturestore.Store{ @@ -302,7 +299,7 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res // attempt to load a verifier as defined in the payload verifier, signatureStore, err := loadConfigMapVerifierDataFromUpdate(update, httpClientConstructor.HTTPClient, configClient, customSignatureStore) if err != nil { - return err + return nil, err } if verifier != nil { klog.Infof("Verifying release authenticity: %v", verifier) @@ -312,6 +309,16 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res } optr.verifier = verifier optr.signatureStore = signatureStore + return update, nil +} + +// InitializeFromPayload configures the controller that loads and applies content to the cluster given an initial payload +// and feature gate data. +func (optr *Operator) InitializeFromPayload(update *payload.Update, requiredFeatureSet configv1.FeatureSet, cvoFlags featuregates.CvoGateChecker, restConfig *rest.Config, burstRestConfig *rest.Config) { + optr.enabledFeatureGates = cvoFlags + optr.release = update.Release + optr.releaseCreated = update.ImageRef.CreationTimestamp.Time + optr.SetArchitecture(update.Architecture) // after the verifier has been loaded, initialize the sync worker with a payload retriever // which will consume the verifier @@ -327,12 +334,10 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res Cap: time.Second * 15, }, optr.exclude, - optr.clusterFeatures.StartingRequiredFeatureSet, + requiredFeatureSet, optr.eventRecorder, optr.clusterProfile, ) - - return nil } // ownerReferenceModifier sets the owner reference to the current CV resource if no other reference exists. It also resets diff --git a/pkg/cvo/cvo_scenarios_test.go b/pkg/cvo/cvo_scenarios_test.go index 5b7612d3ee..82c1cdd5a7 100644 --- a/pkg/cvo/cvo_scenarios_test.go +++ b/pkg/cvo/cvo_scenarios_test.go @@ -26,10 +26,11 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/openshift/client-go/config/clientset/versioned/fake" + "github.com/openshift/library-go/pkg/manifest" + "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/payload" "github.com/openshift/cluster-version-operator/pkg/payload/precondition" - "github.com/openshift/library-go/pkg/manifest" ) var architecture string @@ -108,14 +109,15 @@ func setupCVOTest(payloadDir string) (*Operator, map[string]apiruntime.Object, * } o := &Operator{ - namespace: "test", - name: "version", - queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cvo-loop-test"), - client: client, - cvLister: &clientCVLister{client: client}, - exclude: "exclude-test", - eventRecorder: record.NewFakeRecorder(100), - clusterProfile: payload.DefaultClusterProfile, + namespace: "test", + name: "version", + queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cvo-loop-test"), + client: client, + enabledFeatureGates: featuregates.DefaultCvoGates("version"), + cvLister: &clientCVLister{client: client}, + exclude: "exclude-test", + eventRecorder: record.NewFakeRecorder(100), + clusterProfile: payload.DefaultClusterProfile, } dynamicScheme := apiruntime.NewScheme() diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index cb3d693a90..73c2275bb6 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -39,11 +39,12 @@ import ( configv1 "github.com/openshift/api/config/v1" clientset "github.com/openshift/client-go/config/clientset/versioned" "github.com/openshift/client-go/config/clientset/versioned/fake" - - "github.com/openshift/cluster-version-operator/pkg/payload" "github.com/openshift/library-go/pkg/manifest" "github.com/openshift/library-go/pkg/verify/store/serial" "github.com/openshift/library-go/pkg/verify/store/sigstore" + + "github.com/openshift/cluster-version-operator/pkg/featuregates" + "github.com/openshift/cluster-version-operator/pkg/payload" ) var ( @@ -2273,6 +2274,7 @@ func TestOperator_sync(t *testing.T) { optr.configSync = &fakeSyncRecorder{Returns: expectStatus} } optr.eventRecorder = record.NewFakeRecorder(100) + optr.enabledFeatureGates = featuregates.DefaultCvoGates("version") ctx := context.Background() err := optr.sync(ctx, optr.queueKey()) diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index ba5917fb7e..e5da4fd217 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -199,7 +199,7 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1 original = config.DeepCopy() } - updateClusterVersionStatus(&config.Status, status, optr.release, optr.getAvailableUpdates, optr.clusterFeatures.StartingCvoFeatureGates, validationErrs) + updateClusterVersionStatus(&config.Status, status, optr.release, optr.getAvailableUpdates, optr.enabledFeatureGates, validationErrs) if klog.V(6).Enabled() { klog.Infof("Apply config: %s", diff.ObjectReflectDiff(original, config)) @@ -211,7 +211,7 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1 // updateClusterVersionStatus updates the passed cvStatus with the latest status information func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status *SyncWorkerStatus, - release configv1.Release, getAvailableUpdates func() *availableUpdates, enabledGates featuregates.CvoGates, + release configv1.Release, getAvailableUpdates func() *availableUpdates, enabledGates featuregates.CvoGateChecker, validationErrs field.ErrorList) { cvStatus.ObservedGeneration = status.Generation @@ -382,7 +382,7 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status } oldRriCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType) - if enabledGates.ResourceReconciliationIssuesCondition || (oldRriCondition != nil && enabledGates.UnknownVersion) { + if enabledGates.ResourceReconciliationIssuesCondition() || (oldRriCondition != nil && enabledGates.UnknownVersion()) { rriCondition := configv1.ClusterOperatorStatusCondition{ Type: resourceReconciliationIssuesConditionType, Status: configv1.ConditionFalse, diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index b953d139b1..5b89522b8f 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -17,7 +17,6 @@ import ( "github.com/openshift/client-go/config/clientset/versioned/fake" "github.com/openshift/cluster-version-operator/lib/resourcemerge" - "github.com/openshift/cluster-version-operator/pkg/featuregates" ) func Test_mergeEqualVersions(t *testing.T) { @@ -197,6 +196,19 @@ func TestOperator_syncFailingStatus(t *testing.T) { } } +type fakeRriFlags struct { + unknownVersion bool + resourceReconciliationIssuesCondition bool +} + +func (f fakeRriFlags) UnknownVersion() bool { + return f.unknownVersion +} + +func (f fakeRriFlags) ResourceReconciliationIssuesCondition() bool { + return f.resourceReconciliationIssuesCondition +} + func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) { ignoreLastTransitionTime := cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime") @@ -247,9 +259,9 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - gates := featuregates.CvoGates{ - UnknownVersion: tc.unknownVersion, - ResourceReconciliationIssuesCondition: false, + gates := fakeRriFlags{ + unknownVersion: tc.unknownVersion, + resourceReconciliationIssuesCondition: false, } release := configv1.Release{} getAvailableUpdates := func() *availableUpdates { return nil } @@ -317,7 +329,10 @@ func TestUpdateClusterVersionStatus_ResourceReconciliationIssues(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - gates := featuregates.CvoGates{ResourceReconciliationIssuesCondition: tc.enabled} + gates := fakeRriFlags{ + unknownVersion: false, + resourceReconciliationIssuesCondition: tc.enabled, + } release := configv1.Release{} getAvailableUpdates := func() *availableUpdates { return nil } var noErrors field.ErrorList diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go index 2bf2d26a42..cfaeb3a318 100644 --- a/pkg/cvo/sync_worker.go +++ b/pkg/cvo/sync_worker.go @@ -178,14 +178,14 @@ type SyncWorker struct { // requiredFeatureSet is set to the value of Feature.config.openshift.io|spec.featureSet, which contributes to // whether or not some manifests are included for reconciliation. - requiredFeatureSet string + requiredFeatureSet configv1.FeatureSet clusterProfile string } // NewSyncWorker initializes a ConfigSyncWorker that will retrieve payloads to disk, apply them via builder // to a server, and obey limits about how often to reconcile or retry on errors. -func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet string, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker { +func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet configv1.FeatureSet, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker { return &SyncWorker{ retriever: retriever, builder: builder, @@ -210,7 +210,7 @@ func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder, // NewSyncWorkerWithPreconditions initializes a ConfigSyncWorker that will retrieve payloads to disk, apply them via builder // to a server, and obey limits about how often to reconcile or retry on errors. // It allows providing preconditions for loading payload. -func NewSyncWorkerWithPreconditions(retriever PayloadRetriever, builder payload.ResourceBuilder, preconditions precondition.List, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet string, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker { +func NewSyncWorkerWithPreconditions(retriever PayloadRetriever, builder payload.ResourceBuilder, preconditions precondition.List, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, requiredFeatureSet configv1.FeatureSet, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker { worker := NewSyncWorker(retriever, builder, reconcileInterval, backoff, exclude, requiredFeatureSet, eventRecorder, clusterProfile) worker.preconditions = preconditions return worker @@ -315,7 +315,7 @@ func (w *SyncWorker) syncPayload(ctx context.Context, work *SyncWork) ([]configv // Capability filtering is not done here since unknown capabilities are allowed // during updated payload load and enablement checking only occurs during apply. - payloadUpdate, err := payload.LoadUpdate(info.Directory, desired.Image, w.exclude, w.requiredFeatureSet, w.clusterProfile, nil) + payloadUpdate, err := payload.LoadUpdate(info.Directory, desired.Image, w.exclude, string(w.requiredFeatureSet), w.clusterProfile, nil) if err != nil { msg := fmt.Sprintf("Loading payload failed version=%q image=%q failure=%v", desired.Version, desired.Image, err) diff --git a/pkg/featuregates/featurechangestopper.go b/pkg/featuregates/featurechangestopper.go index 8c7126cc79..7231bc82b2 100644 --- a/pkg/featuregates/featurechangestopper.go +++ b/pkg/featuregates/featurechangestopper.go @@ -19,7 +19,8 @@ import ( // ChangeStopper calls stop when the value of the featureset changes type ChangeStopper struct { - clusterFeatures ClusterFeatures + startingRequiredFeatureSet *configv1.FeatureSet + startingCvoGates *CvoGates featureGateLister configlistersv1.FeatureGateLister cacheSynced []cache.InformerSynced @@ -28,18 +29,13 @@ type ChangeStopper struct { shutdownFn context.CancelFunc } -// New returns a new ChangeStopper. -func New( - clusterFeatures ClusterFeatures, - featureGateInformer configinformersv1.FeatureGateInformer, -) (*ChangeStopper, error) { +// NewChangeStopper returns a new ChangeStopper. +func NewChangeStopper(featureGateInformer configinformersv1.FeatureGateInformer) (*ChangeStopper, error) { c := &ChangeStopper{ - clusterFeatures: clusterFeatures, featureGateLister: featureGateInformer.Lister(), cacheSynced: []cache.InformerSynced{featureGateInformer.Informer().HasSynced}, queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "feature-gate-stopper"), } - c.queue.Add("cluster") // seed an initial sync, in case startingRequiredFeatureSet is wrong if _, err := featureGateInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(_ interface{}) { @@ -58,6 +54,11 @@ func New( return c, nil } +func (c *ChangeStopper) SetStartingFeatures(requiredFeatureSet configv1.FeatureSet, cvoGates CvoGates) { + c.startingRequiredFeatureSet = &requiredFeatureSet + c.startingCvoGates = &cvoGates +} + // syncHandler processes a single work entry, with the // processNextWorkItem caller handling the queue management. It returns // done when there will be no more work (because the feature gate changed). @@ -65,14 +66,15 @@ func (c *ChangeStopper) syncHandler(_ context.Context) (done bool, err error) { var current configv1.FeatureSet var currentCvoGates CvoGates if featureGates, err := c.featureGateLister.Get("cluster"); err == nil { + current = featureGates.Spec.FeatureSet - currentCvoGates = getCvoGatesFrom(featureGates, c.clusterFeatures.VersionForGates) + currentCvoGates = CvoGatesFromFeatureGate(featureGates, c.startingCvoGates.desiredVersion) } else if !apierrors.IsNotFound(err) { return false, err } - featureSetChanged := string(current) != c.clusterFeatures.StartingRequiredFeatureSet - cvoFeaturesChanged := currentCvoGates != c.clusterFeatures.StartingCvoFeatureGates + featureSetChanged := current != *c.startingRequiredFeatureSet + cvoFeaturesChanged := currentCvoGates != *c.startingCvoGates if featureSetChanged || cvoFeaturesChanged { var action string if c.shutdownFn == nil { @@ -81,10 +83,10 @@ func (c *ChangeStopper) syncHandler(_ context.Context) (done bool, err error) { action = "requesting shutdown" } if featureSetChanged { - klog.Infof("FeatureSet was %q, but the current feature set is %q; %s.", c.clusterFeatures.StartingRequiredFeatureSet, current, action) + klog.Infof("FeatureSet was %q, but the current feature set is %q; %s.", *c.startingRequiredFeatureSet, current, action) } if cvoFeaturesChanged { - klog.Infof("CVO feature flags were %v, but changed to %v; %s.", c.clusterFeatures.StartingCvoFeatureGates, currentCvoGates, action) + klog.Infof("CVO feature flags were %+v, but changed to %+v; %s.", c.startingCvoGates, currentCvoGates, action) } if c.shutdownFn != nil { @@ -97,6 +99,10 @@ func (c *ChangeStopper) syncHandler(_ context.Context) (done bool, err error) { // Run launches the controller and blocks until it is canceled or work completes. func (c *ChangeStopper) Run(ctx context.Context, shutdownFn context.CancelFunc) error { + if c.startingRequiredFeatureSet == nil || c.startingCvoGates == nil { + return errors.New("BUG: startingRequiredFeatureSet and startingCvoGates must be set before calling Run") + + } // don't let panics crash the process defer utilruntime.HandleCrash() // make sure the work queue is shutdown which will trigger workers to end @@ -109,13 +115,13 @@ func (c *ChangeStopper) Run(ctx context.Context, shutdownFn context.CancelFunc) }() c.shutdownFn = shutdownFn - klog.Infof("Starting stop-on-featureset-change controller with %q.", c.clusterFeatures.StartingRequiredFeatureSet) - // wait for your secondary caches to fill before starting your work if !cache.WaitForCacheSync(ctx.Done(), c.cacheSynced...) { return errors.New("feature gate cache failed to sync") } + klog.Infof("Starting stop-on-features-change controller with startingRequiredFeatureSet=%q startingCvoGates=%+v", *c.startingRequiredFeatureSet, *c.startingCvoGates) + err := wait.PollUntilContextCancel(ctx, 30*time.Second, true, c.runWorker) klog.Info("Shutting down stop-on-featureset-change controller") return err diff --git a/pkg/featuregates/featurechangestopper_test.go b/pkg/featuregates/featurechangestopper_test.go index d606625671..7e290cf219 100644 --- a/pkg/featuregates/featurechangestopper_test.go +++ b/pkg/featuregates/featurechangestopper_test.go @@ -15,7 +15,7 @@ func TestTechPreviewChangeStopper(t *testing.T) { versionForGates := "1.2.3" tests := []struct { name string - startingRequiredFeatureSet string + startingRequiredFeatureSet configv1.FeatureSet startingCvoFeatureGates CvoGates featureSet string @@ -57,7 +57,8 @@ func TestTechPreviewChangeStopper(t *testing.T) { name: "cvo flags changed", startingRequiredFeatureSet: "TechPreviewNoUpgrade", startingCvoFeatureGates: CvoGates{ - UnknownVersion: true, + desiredVersion: versionForGates, + unknownVersion: true, }, featureSet: "TechPreviewNoUpgrade", featureGateStatus: &configv1.FeatureGateStatus{ @@ -93,22 +94,17 @@ func TestTechPreviewChangeStopper(t *testing.T) { fg.Status = *tt.featureGateStatus } else { fg.Status = configv1.FeatureGateStatus{} - tt.startingCvoFeatureGates = CvoGates{UnknownVersion: true} + tt.startingCvoFeatureGates = CvoGates{unknownVersion: true} } client := fakeconfigv1client.NewSimpleClientset(fg) informerFactory := configv1informer.NewSharedInformerFactory(client, 0) - featureGates := informerFactory.Config().V1().FeatureGates() - cf := ClusterFeatures{ - StartingRequiredFeatureSet: tt.startingRequiredFeatureSet, - StartingCvoFeatureGates: tt.startingCvoFeatureGates, - VersionForGates: versionForGates, - } - c, err := New(cf, featureGates) + c, err := NewChangeStopper(informerFactory.Config().V1().FeatureGates()) if err != nil { t.Fatal(err) } + c.SetStartingFeatures(tt.startingRequiredFeatureSet, tt.startingCvoFeatureGates) informerFactory.Start(ctx.Done()) if err := c.Run(ctx, shutdownFn); err != nil { diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index efbfb6b7ce..e2cfbd2a80 100644 --- a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -2,14 +2,10 @@ package featuregates import ( configv1 "github.com/openshift/api/config/v1" - "k8s.io/klog/v2" ) -// CvoGates contains flags that control CVO functionality gated by product feature gates. The -// names do not correspond to product feature gates, the booleans here are "smaller" (product-level -// gate will enable multiple CVO behaviors). -type CvoGates struct { - +// CvoGateChecker allows CVO code to check which feature gates are enabled +type CvoGateChecker interface { // UnknownVersion flag is set to true if CVO did not find a matching version in the FeatureGate // status resource, meaning the current set of enabled and disabled feature gates is unknown for // this version. This should be a temporary state (config-operator should eventually add the @@ -19,7 +15,7 @@ type CvoGates struct { // enabled, it can continue to behave as if it was enabled and vice versa. This temporary state // should be eventually resolved when the FeatureGate status resource is updated, which forces CVO // to restart when the flags change. - UnknownVersion bool + UnknownVersion() bool // ResourceReconciliationIssuesCondition controls whether CVO maintains a Condition with // ResourceReconciliationIssues type, containing a JSON that describes all "issues" that prevented @@ -27,56 +23,79 @@ type CvoGates struct { // that the experimental work for "oc adm upgrade status" uses to report upgrade status, and // should never be relied upon by any production code. We may want to eventually turn this into // some kind of "real" API. - ResourceReconciliationIssuesCondition bool + ResourceReconciliationIssuesCondition() bool } -type ClusterFeatures struct { - StartingRequiredFeatureSet string +type panicOnUsageBeforeInitializationFunc func() - // StartingCvoFeatureGates control gated CVO functionality (not gated cluster functionality) - StartingCvoFeatureGates CvoGates - VersionForGates string +func panicOnUsageBeforeInitialization() { + panic("CVO feature flags were used before they were initialized") } -func DefaultClusterFeatures(version string) ClusterFeatures { - return ClusterFeatures{ - // check to see if techpreview should be on or off. If we cannot read the featuregate for any reason, it is assumed - // to be off. If this value changes, the binary will shutdown and expect the pod lifecycle to restart it. - StartingRequiredFeatureSet: "", - StartingCvoFeatureGates: defaultGatesWhenUnknown(), - VersionForGates: version, - } +// PanicOnUsageBeforeInitialization is a CvoGateChecker that panics if any of its methods are called. This checker should +// be used before CVO feature gates are actually known and some code tries to check them. +var PanicOnUsageBeforeInitialization = panicOnUsageBeforeInitializationFunc(panicOnUsageBeforeInitialization) + +func (p panicOnUsageBeforeInitializationFunc) ResourceReconciliationIssuesCondition() bool { + p() + return false } -func ClusterFeaturesFromFeatureGate(gate *configv1.FeatureGate, version string) ClusterFeatures { - return ClusterFeatures{ - StartingRequiredFeatureSet: string(gate.Spec.FeatureSet), - StartingCvoFeatureGates: getCvoGatesFrom(gate, version), - VersionForGates: version, - } +func (p panicOnUsageBeforeInitializationFunc) UnknownVersion() bool { + p() + return false } -func defaultGatesWhenUnknown() CvoGates { - return CvoGates{ - UnknownVersion: true, +// CvoGates contains flags that control CVO functionality gated by product feature gates. The +// names do not correspond to product feature gates, the booleans here are "smaller" (product-level +// gate will enable multiple CVO behaviors). +type CvoGates struct { + // desiredVersion stores the currently executing version of CVO, for which these feature gates + // are relevant + desiredVersion string + + // individual flags mirror the CvoGateChecker interface + unknownVersion bool + resourceReconciliationIssuesCondition bool +} - ResourceReconciliationIssuesCondition: false, +func (c CvoGates) ResourceReconciliationIssuesCondition() bool { + return c.resourceReconciliationIssuesCondition +} + +func (c CvoGates) UnknownVersion() bool { + return c.unknownVersion +} + +// DefaultCvoGates apply when actual features for given version are unknown +func DefaultCvoGates(version string) CvoGates { + return CvoGates{ + desiredVersion: version, + unknownVersion: true, + resourceReconciliationIssuesCondition: false, } } -func getCvoGatesFrom(gate *configv1.FeatureGate, version string) CvoGates { - enabledGates := defaultGatesWhenUnknown() - klog.Infof("Looking up feature gates for version %s", version) +// CvoGatesFromFeatureGate finds feature gates for a given version in a FeatureGate resource and returns +// CvoGates that reflects them, or the default gates if given version was not found in the FeatureGate +func CvoGatesFromFeatureGate(gate *configv1.FeatureGate, version string) CvoGates { + enabledGates := DefaultCvoGates(version) + for _, g := range gate.Status.FeatureGates { if g.Version != version { continue } // We found the matching version, so we do not need to run in the unknown version mode - enabledGates.UnknownVersion = false + enabledGates.unknownVersion = false for _, enabled := range g.Enabled { if enabled.Name == configv1.FeatureGateUpgradeStatus { - enabledGates.ResourceReconciliationIssuesCondition = true + enabledGates.resourceReconciliationIssuesCondition = true + } + } + for _, disabled := range g.Disabled { + if disabled.Name == configv1.FeatureGateUpgradeStatus { + enabledGates.resourceReconciliationIssuesCondition = false } } } diff --git a/pkg/start/start.go b/pkg/start/start.go index 62397b8e7b..265d8f222b 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -13,7 +13,7 @@ import ( "time" "github.com/google/uuid" - + configlistersv1 "github.com/openshift/client-go/config/listers/config/v1" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -157,55 +157,13 @@ func (o *Options) Run(ctx context.Context) error { return fmt.Errorf("error creating clients: %v", err) } - var clusterFeatures featuregates.ClusterFeatures - - // This is analogical to VersionForOperatorFromEnv() from o/library-go but the import is pretty - // heavy for a single, simple os.Getenv wrapper, so we just inline the logic here. - operatorVersion := os.Getenv("OPERATOR_IMAGE_VERSION") - - // client-go automatically retries some network blip errors on GETs for 30s by default, and we want to - // retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work - // anyway. Return the error and crashloop. - // - // We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave - // well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go - var lastError error - if err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 25*time.Second, true, func(ctx context.Context) (bool, error) { - gate, fgErr := cb.ClientOrDie("feature-gate-getter").ConfigV1().FeatureGates().Get(ctx, "cluster", metav1.GetOptions{}) - switch { - case apierrors.IsNotFound(fgErr): - // if we have no featuregates, then the cluster is using the default featureset, which is "". - // This excludes everything that could possibly depend on a different feature set. - clusterFeatures = featuregates.DefaultClusterFeatures(operatorVersion) - return true, nil - case fgErr != nil: - lastError = fgErr - klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr) - return false, nil - default: - clusterFeatures = featuregates.ClusterFeaturesFromFeatureGate(gate, operatorVersion) - return true, nil - } - }); err != nil { - if lastError != nil { - return lastError - } - return err - } - - klog.Infof("Feature set detected at startup: %q", clusterFeatures.StartingRequiredFeatureSet) - if clusterFeatures.StartingCvoFeatureGates.UnknownVersion { - klog.Infof("CVO features for version %s could not be detected from FeatureGate; will use defaults plus special UnkownVersion feature gate", clusterFeatures.VersionForGates) - } - klog.Infof("CVO features for version %s enabled at startup: %+v", clusterFeatures.VersionForGates, clusterFeatures.StartingCvoFeatureGates) - lock, err := createResourceLock(cb, o.Namespace, o.Name) if err != nil { return err } // initialize the controllers and attempt to load the payload information - controllerCtx, err := o.NewControllerContext(cb, clusterFeatures) + controllerCtx, err := o.NewControllerContext(cb) if err != nil { return err } @@ -278,7 +236,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock resource resultChannel <- asyncResult{name: "metrics server", error: err} }() } - if err := controllerCtx.CVO.InitializeFromPayload(runContext, restConfig, burstRestConfig); err != nil { + if err := controllerCtx.InitializeFromPayload(runContext, restConfig, burstRestConfig); err != nil { if firstError == nil { firstError = err } @@ -477,11 +435,13 @@ type Context struct { OpenshiftConfigInformerFactory informers.SharedInformerFactory OpenshiftConfigManagedInformerFactory informers.SharedInformerFactory InformerFactory externalversions.SharedInformerFactory + + fgLister configlistersv1.FeatureGateLister } // NewControllerContext initializes the default Context for the current Options. It does // not start any background processes. -func (o *Options) NewControllerContext(cb *ClientBuilder, clusterFeatures featuregates.ClusterFeatures) (*Context, error) { +func (o *Options) NewControllerContext(cb *ClientBuilder) (*Context, error) { client := cb.ClientOrDie("shared-informer") kubeClient := cb.KubeClientOrDie(internal.ConfigNamespace, useProtobuf) @@ -494,10 +454,6 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterFeatures featur sharedInformers := externalversions.NewSharedInformerFactory(client, resyncPeriod(o.ResyncInterval)) coInformer := sharedInformers.Config().V1().ClusterOperators() - featureChangeStopper, err := featuregates.New(clusterFeatures, sharedInformers.Config().V1().FeatureGates()) - if err != nil { - return nil, err - } cvoKubeClient := cb.KubeClientOrDie(o.Namespace, useProtobuf) o.PromQLTarget.KubeClient = cvoKubeClient @@ -515,7 +471,6 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterFeatures featur cb.ClientOrDie(o.Namespace), cvoKubeClient, o.Exclude, - clusterFeatures, o.ClusterProfile, o.PromQLTarget, o.InjectClusterIdIntoPromQL, @@ -524,6 +479,11 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterFeatures featur return nil, err } + featureChangeStopper, err := featuregates.NewChangeStopper(sharedInformers.Config().V1().FeatureGates()) + if err != nil { + return nil, err + } + ctx := &Context{ CVInformerFactory: cvInformer, OpenshiftConfigInformerFactory: openshiftConfigInformer, @@ -531,6 +491,8 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterFeatures featur InformerFactory: sharedInformers, CVO: cvo, StopOnFeatureGateChange: featureChangeStopper, + + fgLister: sharedInformers.Config().V1().FeatureGates().Lister(), } if o.EnableAutoUpdate { @@ -552,3 +514,67 @@ func (o *Options) NewControllerContext(cb *ClientBuilder, clusterFeatures featur } return ctx, nil } + +// InitializeFromPayload initializes the CVO and FeatureGate ChangeStoppers controllers from the payload. It extracts the +// current CVO version from the initial payload and uses it to determine the initial the required featureset and enabled +// feature gates. Both the payload and determined feature information are used to initialize CVO and feature gate +// ChangeStopper controllers. +func (c *Context) InitializeFromPayload(ctx context.Context, restConfig *rest.Config, burstRestConfig *rest.Config) error { + var startingFeatureSet configv1.FeatureSet + var clusterFeatureGate *configv1.FeatureGate + + // client-go automatically retries some network blip errors on GETs for 30s by default, and we want to + // retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work + // anyway. Return the error and crashloop. + // + // We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave + // well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go + var lastError error + if err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 25*time.Second, true, func(ctx context.Context) (bool, error) { + gate, fgErr := c.fgLister.Get("cluster") + switch { + case apierrors.IsNotFound(fgErr): + // if we have no featuregates, then the cluster is using the default featureset, which is "". + // This excludes everything that could possibly depend on a different feature set. + startingFeatureSet = "" + klog.Infof("FeatureGate not found in cluster, using default feature set %q at startup", startingFeatureSet) + return true, nil + case fgErr != nil: + lastError = fgErr + klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr) + return false, nil + default: + clusterFeatureGate = gate + startingFeatureSet = gate.Spec.FeatureSet + klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet) + return true, nil + } + }); err != nil { + if lastError != nil { + return lastError + } + return err + } + + payload, err := c.CVO.LoadInitialPayload(ctx, startingFeatureSet, restConfig) + if err != nil { + return err + } + + var cvoGates featuregates.CvoGates + if clusterFeatureGate != nil { + cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, payload.Release.Version) + } else { + cvoGates = featuregates.DefaultCvoGates(payload.Release.Version) + } + + if cvoGates.UnknownVersion() { + klog.Infof("CVO features for version %s could not be detected from FeatureGate; will use defaults plus special UnknownVersion feature gate", payload.Release.Version) + } + klog.Infof("CVO features for version %s enabled at startup: %+v", payload.Release.Version, cvoGates) + + c.StopOnFeatureGateChange.SetStartingFeatures(startingFeatureSet, cvoGates) + c.CVO.InitializeFromPayload(payload, startingFeatureSet, cvoGates, restConfig, burstRestConfig) + + return nil +} diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index 8c40607baf..3fd2a2464a 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -33,7 +33,6 @@ import ( "github.com/openshift/cluster-version-operator/lib/resourcemerge" "github.com/openshift/cluster-version-operator/pkg/cvo" - "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/payload" ) @@ -185,13 +184,12 @@ func TestIntegrationCVO_initializeAndUpgrade(t *testing.T) { options.ReleaseImage = payloadImage1 options.PayloadOverride = filepath.Join(dir, "0.0.1") options.leaderElection = getLeaderElectionConfig(ctx, cfg) - var clusterFeatures featuregates.ClusterFeatures - controllers, err := options.NewControllerContext(cb, clusterFeatures) + controllers, err := options.NewControllerContext(cb) if err != nil { t.Fatal(err) } - worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", clusterFeatures.StartingRequiredFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) + worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", "", record.NewFakeRecorder(100), payload.DefaultClusterProfile) controllers.CVO.SetSyncWorkerForTesting(worker) lock, err := createResourceLock(cb, options.Namespace, options.Name) @@ -317,13 +315,12 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { options.ReleaseImage = payloadImage1 options.PayloadOverride = filepath.Join(dir, "0.0.1") options.leaderElection = getLeaderElectionConfig(ctx, cfg) - var clusterFeatures featuregates.ClusterFeatures - controllers, err := options.NewControllerContext(cb, clusterFeatures) + controllers, err := options.NewControllerContext(cb) if err != nil { t.Fatal(err) } - worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", clusterFeatures.StartingRequiredFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) + worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", "", record.NewFakeRecorder(100), payload.DefaultClusterProfile) controllers.CVO.SetSyncWorkerForTesting(worker) lock, err := createResourceLock(cb, options.Namespace, options.Name) @@ -512,13 +509,12 @@ metadata: options.PayloadOverride = payloadDir options.leaderElection = getLeaderElectionConfig(ctx, cfg) - var clusterFeatures featuregates.ClusterFeatures - controllers, err := options.NewControllerContext(cb, clusterFeatures) + controllers, err := options.NewControllerContext(cb) if err != nil { t.Fatal(err) } - worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", clusterFeatures.StartingRequiredFeatureSet, record.NewFakeRecorder(100), payload.DefaultClusterProfile) + worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", "", record.NewFakeRecorder(100), payload.DefaultClusterProfile) controllers.CVO.SetSyncWorkerForTesting(worker) arch := runtime.GOARCH From 16044483bd5fe681726f21eb89046376444dd9e9 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Wed, 27 Mar 2024 16:59:59 +0100 Subject: [PATCH 5/5] Rename `ResourceReconciliationIssues` to `ReconciliationIssues` --- pkg/cvo/reconciliation_issues.go | 13 ++++ pkg/cvo/resource_reconciliation_issues.go | 13 ---- pkg/cvo/status.go | 24 +++--- pkg/cvo/status_test.go | 90 +++++++++++------------ pkg/featuregates/featuregates.go | 26 +++---- 5 files changed, 83 insertions(+), 83 deletions(-) create mode 100644 pkg/cvo/reconciliation_issues.go delete mode 100644 pkg/cvo/resource_reconciliation_issues.go diff --git a/pkg/cvo/reconciliation_issues.go b/pkg/cvo/reconciliation_issues.go new file mode 100644 index 0000000000..144cc619ef --- /dev/null +++ b/pkg/cvo/reconciliation_issues.go @@ -0,0 +1,13 @@ +package cvo + +import v1 "github.com/openshift/api/config/v1" + +const ( + reconciliationIssuesConditionType v1.ClusterStatusConditionType = "ReconciliationIssues" + + noReconciliationIssuesReason string = "NoIssues" + noReconciliationIssuesMessage string = "No issues found during reconciliation" + + reconciliationIssuesFoundReason string = "IssuesFound" + reconciliationIssuesFoundMessage string = "Issues found during reconciliation" +) diff --git a/pkg/cvo/resource_reconciliation_issues.go b/pkg/cvo/resource_reconciliation_issues.go deleted file mode 100644 index 04a57753d7..0000000000 --- a/pkg/cvo/resource_reconciliation_issues.go +++ /dev/null @@ -1,13 +0,0 @@ -package cvo - -import v1 "github.com/openshift/api/config/v1" - -const ( - resourceReconciliationIssuesConditionType v1.ClusterStatusConditionType = "ResourceReconciliationIssues" - - noResourceReconciliationIssuesReason string = "NoIssues" - noResourceReconciliationIssuesMessage string = "No issues found during resource reconciliation" - - resourceReconciliationIssuesFoundReason string = "IssuesFound" - resourceReconciliationIssuesFoundMessage string = "Issues found during resource reconciliation" -) diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index e5da4fd217..36ec476c22 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -381,22 +381,22 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status } } - oldRriCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType) - if enabledGates.ResourceReconciliationIssuesCondition() || (oldRriCondition != nil && enabledGates.UnknownVersion()) { - rriCondition := configv1.ClusterOperatorStatusCondition{ - Type: resourceReconciliationIssuesConditionType, + oldRiCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, reconciliationIssuesConditionType) + if enabledGates.ReconciliationIssuesCondition() || (oldRiCondition != nil && enabledGates.UnknownVersion()) { + riCondition := configv1.ClusterOperatorStatusCondition{ + Type: reconciliationIssuesConditionType, Status: configv1.ConditionFalse, - Reason: noResourceReconciliationIssuesReason, - Message: noResourceReconciliationIssuesMessage, + Reason: noReconciliationIssuesReason, + Message: noReconciliationIssuesMessage, } if status.Failure != nil { - rriCondition.Status = configv1.ConditionTrue - rriCondition.Reason = resourceReconciliationIssuesFoundReason - rriCondition.Message = fmt.Sprintf("%s: %s", resourceReconciliationIssuesFoundMessage, status.Failure.Error()) + riCondition.Status = configv1.ConditionTrue + riCondition.Reason = reconciliationIssuesFoundReason + riCondition.Message = fmt.Sprintf("%s: %s", reconciliationIssuesFoundMessage, status.Failure.Error()) } - resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, rriCondition) - } else if oldRriCondition != nil { - resourcemerge.RemoveOperatorStatusCondition(&cvStatus.Conditions, resourceReconciliationIssuesConditionType) + resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, riCondition) + } else if oldRiCondition != nil { + resourcemerge.RemoveOperatorStatusCondition(&cvStatus.Conditions, reconciliationIssuesConditionType) } // default retrieved updates if it is not set diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index 5b89522b8f..c98d8985db 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -196,20 +196,20 @@ func TestOperator_syncFailingStatus(t *testing.T) { } } -type fakeRriFlags struct { - unknownVersion bool - resourceReconciliationIssuesCondition bool +type fakeRiFlags struct { + unknownVersion bool + reconciliationIssuesCondition bool } -func (f fakeRriFlags) UnknownVersion() bool { +func (f fakeRiFlags) UnknownVersion() bool { return f.unknownVersion } -func (f fakeRriFlags) ResourceReconciliationIssuesCondition() bool { - return f.resourceReconciliationIssuesCondition +func (f fakeRiFlags) ReconciliationIssuesCondition() bool { + return f.reconciliationIssuesCondition } -func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) { +func TestUpdateClusterVersionStatus_UnknownVersionAndReconciliationIssues(t *testing.T) { ignoreLastTransitionTime := cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime") testCases := []struct { @@ -219,49 +219,49 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) { oldCondition *configv1.ClusterOperatorStatusCondition failure error - expectedRriCondition *configv1.ClusterOperatorStatusCondition + expectedRiCondition *configv1.ClusterOperatorStatusCondition }{ { - name: "RRI disabled, version known, no failure => condition not present", - unknownVersion: false, - expectedRriCondition: nil, + name: "ReconciliationIssues disabled, version known, no failure => condition not present", + unknownVersion: false, + expectedRiCondition: nil, }, { - name: "RRI disabled, version known, failure => condition not present", - unknownVersion: false, - failure: fmt.Errorf("Something happened"), - expectedRriCondition: nil, + name: "ReconciliationIssues disabled, version known, failure => condition not present", + unknownVersion: false, + failure: fmt.Errorf("Something happened"), + expectedRiCondition: nil, }, { - name: "RRI disabled, version unknown, failure, existing condition => condition present", + name: "ReconciliationIssues disabled, version unknown, failure, existing condition => condition present", oldCondition: &configv1.ClusterOperatorStatusCondition{ - Type: resourceReconciliationIssuesConditionType, + Type: reconciliationIssuesConditionType, Status: configv1.ConditionFalse, - Reason: noResourceReconciliationIssuesReason, + Reason: noReconciliationIssuesReason, Message: "Happy condition is happy", }, unknownVersion: true, failure: fmt.Errorf("Something happened"), - expectedRriCondition: &configv1.ClusterOperatorStatusCondition{ - Type: resourceReconciliationIssuesConditionType, + expectedRiCondition: &configv1.ClusterOperatorStatusCondition{ + Type: reconciliationIssuesConditionType, Status: configv1.ConditionTrue, - Reason: resourceReconciliationIssuesFoundReason, - Message: "Issues found during resource reconciliation: Something happened", + Reason: reconciliationIssuesFoundReason, + Message: "Issues found during reconciliation: Something happened", }, }, { - name: "RRI disabled, version unknown, failure, no existing condition => condition not present", - unknownVersion: true, - failure: fmt.Errorf("Something happened"), - expectedRriCondition: nil, + name: "ReconciliationIssues disabled, version unknown, failure, no existing condition => condition not present", + unknownVersion: true, + failure: fmt.Errorf("Something happened"), + expectedRiCondition: nil, }, } for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - gates := fakeRriFlags{ - unknownVersion: tc.unknownVersion, - resourceReconciliationIssuesCondition: false, + gates := fakeRiFlags{ + unknownVersion: tc.unknownVersion, + reconciliationIssuesCondition: false, } release := configv1.Release{} getAvailableUpdates := func() *availableUpdates { return nil } @@ -271,8 +271,8 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) { cvStatus.Conditions = append(cvStatus.Conditions, *tc.oldCondition) } updateClusterVersionStatus(&cvStatus, &SyncWorkerStatus{Failure: tc.failure}, release, getAvailableUpdates, gates, noErrors) - condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType) - if diff := cmp.Diff(tc.expectedRriCondition, condition, ignoreLastTransitionTime); diff != "" { + condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, reconciliationIssuesConditionType) + if diff := cmp.Diff(tc.expectedRiCondition, condition, ignoreLastTransitionTime); diff != "" { t.Errorf("unexpected condition\n:%s", diff) } }) @@ -281,7 +281,7 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndRRI(t *testing.T) { } -func TestUpdateClusterVersionStatus_ResourceReconciliationIssues(t *testing.T) { +func TestUpdateClusterVersionStatus_ReconciliationIssues(t *testing.T) { ignoreLastTransitionTime := cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime") testCases := []struct { @@ -293,31 +293,31 @@ func TestUpdateClusterVersionStatus_ResourceReconciliationIssues(t *testing.T) { expectedCondition *configv1.ClusterOperatorStatusCondition }{ { - name: "ResourceReconciliationIssues present and happy when gate is enabled and no failures happened", + name: "ReconciliationIssues present and happy when gate is enabled and no failures happened", syncWorkerStatus: SyncWorkerStatus{}, enabled: true, expectedCondition: &configv1.ClusterOperatorStatusCondition{ - Type: resourceReconciliationIssuesConditionType, + Type: reconciliationIssuesConditionType, Status: configv1.ConditionFalse, - Reason: noResourceReconciliationIssuesReason, - Message: noResourceReconciliationIssuesMessage, + Reason: noReconciliationIssuesReason, + Message: noReconciliationIssuesMessage, }, }, { - name: "ResourceReconciliationIssues present and unhappy when gate is enabled and failures happened", + name: "ReconciliationIssues present and unhappy when gate is enabled and failures happened", syncWorkerStatus: SyncWorkerStatus{ Failure: fmt.Errorf("Something happened"), }, enabled: true, expectedCondition: &configv1.ClusterOperatorStatusCondition{ - Type: resourceReconciliationIssuesConditionType, + Type: reconciliationIssuesConditionType, Status: configv1.ConditionTrue, - Reason: resourceReconciliationIssuesFoundReason, - Message: "Issues found during resource reconciliation: Something happened", + Reason: reconciliationIssuesFoundReason, + Message: "Issues found during reconciliation: Something happened", }, }, { - name: "ResourceReconciliationIssues not present when gate is enabled and failures happened", + name: "ReconciliationIssues not present when gate is enabled and failures happened", syncWorkerStatus: SyncWorkerStatus{ Failure: fmt.Errorf("Something happened"), }, @@ -329,16 +329,16 @@ func TestUpdateClusterVersionStatus_ResourceReconciliationIssues(t *testing.T) { for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { - gates := fakeRriFlags{ - unknownVersion: false, - resourceReconciliationIssuesCondition: tc.enabled, + gates := fakeRiFlags{ + unknownVersion: false, + reconciliationIssuesCondition: tc.enabled, } release := configv1.Release{} getAvailableUpdates := func() *availableUpdates { return nil } var noErrors field.ErrorList cvStatus := configv1.ClusterVersionStatus{} updateClusterVersionStatus(&cvStatus, &tc.syncWorkerStatus, release, getAvailableUpdates, gates, noErrors) - condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, resourceReconciliationIssuesConditionType) + condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, reconciliationIssuesConditionType) if diff := cmp.Diff(tc.expectedCondition, condition, ignoreLastTransitionTime); diff != "" { t.Errorf("unexpected condition\n:%s", diff) } diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index e2cfbd2a80..2563a3be8c 100644 --- a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -17,13 +17,13 @@ type CvoGateChecker interface { // to restart when the flags change. UnknownVersion() bool - // ResourceReconciliationIssuesCondition controls whether CVO maintains a Condition with - // ResourceReconciliationIssues type, containing a JSON that describes all "issues" that prevented + // ReconciliationIssuesCondition controls whether CVO maintains a Condition with + // ReconciliationIssues type, containing a JSON that describes all "issues" that prevented // or delayed CVO from reconciling individual resources in the cluster. This is a pseudo-API // that the experimental work for "oc adm upgrade status" uses to report upgrade status, and // should never be relied upon by any production code. We may want to eventually turn this into // some kind of "real" API. - ResourceReconciliationIssuesCondition() bool + ReconciliationIssuesCondition() bool } type panicOnUsageBeforeInitializationFunc func() @@ -36,7 +36,7 @@ func panicOnUsageBeforeInitialization() { // be used before CVO feature gates are actually known and some code tries to check them. var PanicOnUsageBeforeInitialization = panicOnUsageBeforeInitializationFunc(panicOnUsageBeforeInitialization) -func (p panicOnUsageBeforeInitializationFunc) ResourceReconciliationIssuesCondition() bool { +func (p panicOnUsageBeforeInitializationFunc) ReconciliationIssuesCondition() bool { p() return false } @@ -55,12 +55,12 @@ type CvoGates struct { desiredVersion string // individual flags mirror the CvoGateChecker interface - unknownVersion bool - resourceReconciliationIssuesCondition bool + unknownVersion bool + reconciliationIssuesCondition bool } -func (c CvoGates) ResourceReconciliationIssuesCondition() bool { - return c.resourceReconciliationIssuesCondition +func (c CvoGates) ReconciliationIssuesCondition() bool { + return c.reconciliationIssuesCondition } func (c CvoGates) UnknownVersion() bool { @@ -70,9 +70,9 @@ func (c CvoGates) UnknownVersion() bool { // DefaultCvoGates apply when actual features for given version are unknown func DefaultCvoGates(version string) CvoGates { return CvoGates{ - desiredVersion: version, - unknownVersion: true, - resourceReconciliationIssuesCondition: false, + desiredVersion: version, + unknownVersion: true, + reconciliationIssuesCondition: false, } } @@ -90,12 +90,12 @@ func CvoGatesFromFeatureGate(gate *configv1.FeatureGate, version string) CvoGate enabledGates.unknownVersion = false for _, enabled := range g.Enabled { if enabled.Name == configv1.FeatureGateUpgradeStatus { - enabledGates.resourceReconciliationIssuesCondition = true + enabledGates.reconciliationIssuesCondition = true } } for _, disabled := range g.Disabled { if disabled.Name == configv1.FeatureGateUpgradeStatus { - enabledGates.resourceReconciliationIssuesCondition = false + enabledGates.reconciliationIssuesCondition = false } } }