From 64f63cb7396bced003e12a62e378699f53fb46d3 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Thu, 19 Feb 2026 21:17:23 -0800 Subject: [PATCH 1/2] fix: auth distributor --- cmd/apiserver/app/config.go | 32 +++-- .../webhook/generic/versioned_attributes.go | 33 ++++++ .../admission/webhook/mutating/dispatcher.go | 2 +- .../webhook/validating/dispatcher.go | 2 +- pkg/multicluster/auth/dispatcher.go | 110 +++++++++++++++-- pkg/multicluster/auth/dispatcher_test.go | 112 +++++++++++++++++- .../bootstrap/crd_runtime_manager_wrapped.go | 66 ++++++++++- 7 files changed, 327 insertions(+), 30 deletions(-) create mode 100644 pkg/multicluster/admission/webhook/generic/versioned_attributes.go diff --git a/cmd/apiserver/app/config.go b/cmd/apiserver/app/config.go index 5c2f474..088c2fa 100644 --- a/cmd/apiserver/app/config.go +++ b/cmd/apiserver/app/config.go @@ -162,10 +162,7 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { return } if h, err := crdRuntimeMgr.Runtime(cid, genericConfig.DrainedNotify()); err == nil && h != nil { - h = genericfilters.WithRequestInfo(h, conf.RequestInfoResolver) - h = genericfilters.WithAuditInit(h) - h = serverfilters.WithPanicRecovery(h, conf.RequestInfoResolver) - h.ServeHTTP(w, r) + wrapClusterCRDHandler(h, conf, cid, false).ServeHTTP(w, r) return } klog.Errorf("mc.crdRuntime unresolved at kube cluster=%s path=%s", cid, r.URL.Path) @@ -361,13 +358,7 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { http.Error(w, "cluster CRD runtime unavailable", http.StatusServiceUnavailable) return true } - // Ensure RequestInfo is computed from the normalized /apis path - // before entering the cluster-scoped CRD runtime handler. 
- h = genericfilters.WithRequestInfo(h, conf.RequestInfoResolver) - h = withClusterCRDRequestInfoRewrite(h, clusterID) - h = genericfilters.WithAuditInit(h) - h = serverfilters.WithPanicRecovery(h, conf.RequestInfoResolver) - h.ServeHTTP(w, r) + wrapClusterCRDHandler(h, conf, clusterID, false).ServeHTTP(w, r) return true } // Ensure CRDs are also routed through the multicluster handler @@ -480,6 +471,25 @@ func withClusterCRDRequestInfoRewrite(next http.Handler, clusterID string) http. }) } +func wrapClusterCRDHandler(next http.Handler, conf *server.Config, clusterID string, rewriteRequestInfo bool) http.Handler { + if next == nil || conf == nil { + return next + } + h := next + if rewriteRequestInfo { + h = withClusterCRDRequestInfoRewrite(h, clusterID) + } + h = genericfilters.WithAuthorization(h, conf.Authorization.Authorizer, conf.Serializer) + failedHandler := genericfilters.Unauthorized(conf.Serializer) + failedHandler = genericfilters.WithFailedAuthenticationAudit(failedHandler, conf.AuditBackend, conf.AuditPolicyRuleEvaluator) + h = genericfilters.WithAuthentication(h, conf.Authentication.Authenticator, failedHandler, conf.Authentication.APIAudiences, conf.Authentication.RequestHeaderConfig) + // RequestInfo must be available before rewrite/authn/authz wrappers execute. 
+ h = genericfilters.WithRequestInfo(h, conf.RequestInfoResolver) + h = genericfilters.WithAuditInit(h) + h = serverfilters.WithPanicRecovery(h, conf.RequestInfoResolver) + return h +} + func decorateRESTOptionsGetter(server string, getter generic.RESTOptionsGetter, opts mc.Options) generic.RESTOptionsGetter { if _, ok := getter.(mc.RESTOptionsDecorator); ok { klog.Infof("mc.restOptionsGetter server=%s alreadyDecorated=true", server) diff --git a/pkg/multicluster/admission/webhook/generic/versioned_attributes.go b/pkg/multicluster/admission/webhook/generic/versioned_attributes.go new file mode 100644 index 0000000..bfd9534 --- /dev/null +++ b/pkg/multicluster/admission/webhook/generic/versioned_attributes.go @@ -0,0 +1,33 @@ +package generic + +import ( + "k8s.io/apiserver/pkg/admission" + "k8s.io/apiserver/pkg/authentication/user" +) + +// EnsureVersionedAttributesUserInfo guarantees that AdmissionReview construction +// always sees a non-nil user.Info, preventing nil dereferences in upstream +// request builders when attributes are missing user context. 
+func EnsureVersionedAttributesUserInfo(attr *admission.VersionedAttributes) *admission.VersionedAttributes { + if attr == nil || attr.Attributes == nil || attr.Attributes.GetUserInfo() != nil { + return attr + } + + cloned := *attr + cloned.Attributes = userInfoFallbackAttributes{Attributes: attr.Attributes} + return &cloned +} + +type userInfoFallbackAttributes struct { + admission.Attributes +} + +func (a userInfoFallbackAttributes) GetUserInfo() user.Info { + if a.Attributes == nil { + return &user.DefaultInfo{} + } + if info := a.Attributes.GetUserInfo(); info != nil { + return info + } + return &user.DefaultInfo{} +} diff --git a/pkg/multicluster/admission/webhook/mutating/dispatcher.go b/pkg/multicluster/admission/webhook/mutating/dispatcher.go index b98716b..16120e0 100644 --- a/pkg/multicluster/admission/webhook/mutating/dispatcher.go +++ b/pkg/multicluster/admission/webhook/mutating/dispatcher.go @@ -254,7 +254,7 @@ func (a *mutatingDispatcher) callAttrMutatingHook(ctx context.Context, h *admiss } } - uid, request, response, err := webhookrequest.CreateAdmissionObjects(attr, invocation) + uid, request, response, err := webhookrequest.CreateAdmissionObjects(generic.EnsureVersionedAttributesUserInfo(attr), invocation) if err != nil { return false, &webhookutil.ErrCallingWebhook{WebhookName: h.Name, Reason: fmt.Errorf("could not create admission objects: %w", err), Status: apierrors.NewBadRequest("error creating admission objects")} } diff --git a/pkg/multicluster/admission/webhook/validating/dispatcher.go b/pkg/multicluster/admission/webhook/validating/dispatcher.go index 8afd335..c208030 100644 --- a/pkg/multicluster/admission/webhook/validating/dispatcher.go +++ b/pkg/multicluster/admission/webhook/validating/dispatcher.go @@ -255,7 +255,7 @@ func (d *validatingDispatcher) callHook(ctx context.Context, h *v1.ValidatingWeb } } - uid, request, response, err := webhookrequest.CreateAdmissionObjects(versionedAttr, invocation) + uid, request, response, err := 
webhookrequest.CreateAdmissionObjects(generic.EnsureVersionedAttributesUserInfo(versionedAttr), invocation) if err != nil { return &webhookutil.ErrCallingWebhook{WebhookName: h.Name, Reason: fmt.Errorf("could not create admission objects: %w", err), Status: apierrors.NewBadRequest("error creating admission objects")} } diff --git a/pkg/multicluster/auth/dispatcher.go b/pkg/multicluster/auth/dispatcher.go index 8333cdc..410d132 100644 --- a/pkg/multicluster/auth/dispatcher.go +++ b/pkg/multicluster/auth/dispatcher.go @@ -2,7 +2,9 @@ package auth import ( "context" + "fmt" "net/http" + "reflect" "strings" mc "github.com/kplane-dev/apiserver/pkg/multicluster" @@ -48,8 +50,8 @@ func (c *ClusterAuthenticator) AuthenticateRequest(req *http.Request) (*authenti } } useRoot := cid == "" || cid == c.rootCluster - if useRoot && c.root != nil { - return c.root.AuthenticateRequest(req) + if useRoot && !isNil(c.root) { + return authenticateSafely(c.root, req, "root") } if c.resolver == nil { return nil, false, nil @@ -58,10 +60,10 @@ func (c *ClusterAuthenticator) AuthenticateRequest(req *http.Request) (*authenti if err != nil { return nil, false, err } - if authn == nil { + if isNil(authn) { return nil, false, nil } - return authn.AuthenticateRequest(req) + return authenticateSafely(authn, req, cid) } // ClusterAuthorizer dispatches authorization per cluster. @@ -93,27 +95,33 @@ func NewClusterAuthorizer(rootCluster string, root authorizer.Authorizer, rootRe // Authorize dispatches by cluster context. 
func (c *ClusterAuthorizer) Authorize(ctx context.Context, a authorizer.Attributes) (authorizer.Decision, string, error) { cid := clusterFromContext(ctx) - if cid == "" || (cid == c.rootCluster && c.root != nil) || c.resolver == nil { - if c.root == nil { + if cid == "" || (cid == c.rootCluster && !isNil(c.root)) || c.resolver == nil { + if isNil(c.root) { return authorizer.DecisionNoOpinion, "no root authorizer", nil } - return c.root.Authorize(ctx, a) + if err := validateAttributesForAuthorize(a, "root", authorizerType(c.root)); err != nil { + return authorizer.DecisionDeny, "", err + } + return authorizeSafely(c.root, ctx, a, "root") } authz, _, err := c.resolver.AuthorizerForCluster(cid) if err != nil { - return authorizer.DecisionNoOpinion, "", err + return authorizer.DecisionDeny, "", err } - if authz == nil { + if isNil(authz) { return authorizer.DecisionNoOpinion, "no cluster authorizer", nil } - return authz.Authorize(ctx, a) + if err := validateAttributesForAuthorize(a, cid, authorizerType(authz)); err != nil { + return authorizer.DecisionDeny, "", err + } + return authorizeSafely(authz, ctx, a, cid) } // RulesFor dispatches rule resolution per cluster. 
func (c *ClusterAuthorizer) RulesFor(ctx context.Context, u user.Info, namespace string) ([]authorizer.ResourceRuleInfo, []authorizer.NonResourceRuleInfo, bool, error) { cid := clusterFromContext(ctx) - if cid == "" || (cid == c.rootCluster && c.rootResolver != nil) || c.resolver == nil { - if c.rootResolver == nil { + if cid == "" || (cid == c.rootCluster && !isNil(c.rootResolver)) || c.resolver == nil { + if isNil(c.rootResolver) { return nil, nil, false, nil } return c.rootResolver.RulesFor(ctx, u, namespace) @@ -122,12 +130,88 @@ func (c *ClusterAuthorizer) RulesFor(ctx context.Context, u user.Info, namespace if err != nil { return nil, nil, false, err } - if resolver == nil { + if isNil(resolver) { return nil, nil, false, nil } return resolver.RulesFor(ctx, u, namespace) } +func isNil(v any) bool { + if v == nil { + return true + } + rv := reflect.ValueOf(v) + switch rv.Kind() { + case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Pointer, reflect.Slice: + return rv.IsNil() + default: + return false + } +} + +func authorizeSafely(authz authorizer.Authorizer, ctx context.Context, a authorizer.Attributes, target string) (decision authorizer.Decision, reason string, err error) { + defer func() { + if r := recover(); r != nil { + decision = authorizer.DecisionDeny + reason = "" + err = fmt.Errorf("authorizer panic for cluster %q (type=%s): %v", target, authorizerType(authz), r) + } + }() + return authz.Authorize(ctx, a) +} + +func validateAttributesForAuthorize(a authorizer.Attributes, clusterID, authzType string) error { + if isNil(a) { + return fmt.Errorf("invalid authorization attributes for cluster %q (authorizer=%s): attributes is nil", clusterID, authzType) + } + u, err := userFromAttributes(a) + if err != nil { + return fmt.Errorf("invalid authorization attributes for cluster %q (authorizer=%s): %w", clusterID, authzType, err) + } + if isNil(u) { + return fmt.Errorf("invalid authorization attributes for cluster %q (authorizer=%s): user 
is nil", clusterID, authzType) + } + return nil +} + +func userFromAttributes(a authorizer.Attributes) (u user.Info, err error) { + defer func() { + if r := recover(); r != nil { + u = nil + err = fmt.Errorf("GetUser panic: %v", r) + } + }() + return a.GetUser(), nil +} + +func authorizerType(authz authorizer.Authorizer) string { + if isNil(authz) { + return "" + } + return fmt.Sprintf("%T", authz) +} + +func authenticateSafely(authn authenticator.Request, req *http.Request, target string) (resp *authenticator.Response, ok bool, err error) { + defer func() { + if r := recover(); r != nil { + resp = nil + ok = false + err = fmt.Errorf("authenticator panic for cluster %q (type=%T): %v", target, authn, r) + } + }() + resp, ok, err = authn.AuthenticateRequest(req) + if err != nil || !ok { + return resp, ok, err + } + if resp == nil { + return nil, false, fmt.Errorf("invalid authenticator response for cluster %q (type=%T): response is nil", target, authn) + } + if isNil(resp.User) { + return nil, false, fmt.Errorf("invalid authenticator response for cluster %q (type=%T): user is nil", target, authn) + } + return resp, ok, nil +} + func clusterFromContext(ctx context.Context) string { cid, _, _ := mc.FromContext(ctx) return cid diff --git a/pkg/multicluster/auth/dispatcher_test.go b/pkg/multicluster/auth/dispatcher_test.go index fa30345..ea4d588 100644 --- a/pkg/multicluster/auth/dispatcher_test.go +++ b/pkg/multicluster/auth/dispatcher_test.go @@ -24,6 +24,12 @@ func (f *fakeAuthenticator) AuthenticateRequest(*http.Request) (*authenticator.R return &authenticator.Response{User: &user.DefaultInfo{Name: f.name}}, true, nil } +type badUserAuthenticator struct{} + +func (b *badUserAuthenticator) AuthenticateRequest(*http.Request) (*authenticator.Response, bool, error) { + return &authenticator.Response{}, true, nil +} + type fakeAuthorizer struct { name string called *string @@ -43,6 +49,16 @@ func (f *fakeAuthorizer) RulesFor(ctx context.Context, _ user.Info, _ string) ([ 
return nil, nil, false, nil } +type panicAuthorizer struct{} + +func (p *panicAuthorizer) Authorize(context.Context, authorizer.Attributes) (authorizer.Decision, string, error) { + panic("boom") +} + +func (p *panicAuthorizer) RulesFor(context.Context, user.Info, string) ([]authorizer.ResourceRuleInfo, []authorizer.NonResourceRuleInfo, bool, error) { + return nil, nil, false, nil +} + type fakeResolver struct { authn authenticator.Request authz authorizer.Authorizer @@ -102,9 +118,10 @@ func TestClusterAuthorizerDispatch(t *testing.T) { resolver := &fakeResolver{authz: cluster, ruleResolver: cluster, lastCluster: &lastCluster} dispatch := NewClusterAuthorizer("root", root, root, resolver) + attrs := authorizer.AttributesRecord{User: &user.DefaultInfo{Name: "test-user"}} ctx := mc.WithCluster(context.Background(), "root", false) - _, _, _ = dispatch.Authorize(ctx, authorizer.AttributesRecord{}) + _, _, _ = dispatch.Authorize(ctx, attrs) if called != "root" { t.Fatalf("expected root authorizer, got %q", called) } @@ -112,7 +129,7 @@ func TestClusterAuthorizerDispatch(t *testing.T) { called = "" lastCluster = "" ctx = mc.WithCluster(context.Background(), "c-2", false) - _, _, _ = dispatch.Authorize(ctx, authorizer.AttributesRecord{}) + _, _, _ = dispatch.Authorize(ctx, attrs) if called != "cluster" { t.Fatalf("expected cluster authorizer, got %q", called) } @@ -155,3 +172,94 @@ func TestClusterAuthenticatorUsesTokenHintWithoutClusterContext(t *testing.T) { t.Fatalf("expected resolver cluster c-42, got %q", lastCluster) } } + +func TestClusterAuthenticatorRejectsNilUserResponse(t *testing.T) { + dispatch := NewClusterAuthenticator("root", nil, &fakeResolver{ + authn: &badUserAuthenticator{}, + }) + req := httptest.NewRequest("GET", "http://example", nil) + req = req.WithContext(mc.WithCluster(req.Context(), "c-bad-user", false)) + + resp, ok, err := dispatch.AuthenticateRequest(req) + if err == nil { + t.Fatalf("expected error, got nil") + } + if ok { + 
t.Fatalf("expected ok=false for invalid auth response") + } + if resp != nil { + t.Fatalf("expected nil response on invalid auth response") + } +} + +func TestClusterAuthorizerTypedNilDoesNotPanic(t *testing.T) { + var typedNilCluster *fakeAuthorizer + dispatch := NewClusterAuthorizer("root", &fakeAuthorizer{name: "root"}, nil, &fakeResolver{ + authz: typedNilCluster, + ruleResolver: typedNilCluster, + }) + attrs := authorizer.AttributesRecord{User: &user.DefaultInfo{Name: "test-user"}} + + ctx := mc.WithCluster(context.Background(), "c-typed-nil", false) + decision, reason, err := dispatch.Authorize(ctx, attrs) + if err != nil { + t.Fatalf("expected nil error, got %v", err) + } + if decision != authorizer.DecisionNoOpinion { + t.Fatalf("expected DecisionNoOpinion, got %v", decision) + } + if reason != "no cluster authorizer" { + t.Fatalf("expected no cluster authorizer reason, got %q", reason) + } + + _, _, incomplete, err := dispatch.RulesFor(ctx, &user.DefaultInfo{Name: "test"}, "") + if err != nil { + t.Fatalf("expected nil error from RulesFor, got %v", err) + } + if incomplete { + t.Fatalf("expected incomplete=false for missing resolver") + } +} + +func TestClusterAuthorizerRootTypedNilDoesNotPanic(t *testing.T) { + var typedNilRoot *fakeAuthorizer + dispatch := NewClusterAuthorizer("root", typedNilRoot, nil, nil) + + decision, reason, err := dispatch.Authorize(context.Background(), authorizer.AttributesRecord{User: &user.DefaultInfo{Name: "test-user"}}) + if err != nil { + t.Fatalf("expected nil error, got %v", err) + } + if decision != authorizer.DecisionNoOpinion { + t.Fatalf("expected DecisionNoOpinion, got %v", decision) + } + if reason != "no root authorizer" { + t.Fatalf("expected no root authorizer reason, got %q", reason) + } +} + +func TestClusterAuthorizerPanicIsRecovered(t *testing.T) { + dispatch := NewClusterAuthorizer("root", &fakeAuthorizer{name: "root"}, nil, &fakeResolver{ + authz: &panicAuthorizer{}, + }) + attrs := 
authorizer.AttributesRecord{User: &user.DefaultInfo{Name: "test-user"}} + + ctx := mc.WithCluster(context.Background(), "c-panic", false) + decision, _, err := dispatch.Authorize(ctx, attrs) + if err == nil { + t.Fatalf("expected recovered panic error, got nil") + } + if decision != authorizer.DecisionDeny { + t.Fatalf("expected DecisionDeny on panic, got %v", decision) + } +} + +func TestClusterAuthorizerNilUserDenied(t *testing.T) { + dispatch := NewClusterAuthorizer("root", &fakeAuthorizer{name: "root"}, nil, nil) + decision, _, err := dispatch.Authorize(context.Background(), authorizer.AttributesRecord{}) + if err == nil { + t.Fatalf("expected error for nil user, got nil") + } + if decision != authorizer.DecisionDeny { + t.Fatalf("expected DecisionDeny for nil user, got %v", decision) + } +} diff --git a/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go b/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go index 2e20978..5ecf0a5 100644 --- a/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go +++ b/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go @@ -131,7 +131,23 @@ func (m *CRDRuntimeManager) ServesGroupVersion(clusterID, group, version string, crdServesLookupLat.WithLabelValues(r).Observe(time.Since(start).Seconds()) return served, nil } - // No fallback direct API lookup: shared projection is the source of truth. + // Reconcile from shared projection, then briefly wait for projection updates + // to absorb fresh CRD install/update events before returning not served. 
+ if served, ok := m.lookupFromSharedProjection(clusterID, group, version); ok { + r := result(served) + crdServesCacheMiss.WithLabelValues("projection").Inc() + crdServesLookupTotal.WithLabelValues(r).Inc() + crdServesLookupLat.WithLabelValues(r).Observe(time.Since(start).Seconds()) + return served, nil + } + if served, ok := m.waitForSharedProjection(clusterID, group, version, 2*time.Second); ok { + r := result(served) + crdServesCacheMiss.WithLabelValues("projection-wait").Inc() + crdServesLookupTotal.WithLabelValues(r).Inc() + crdServesLookupLat.WithLabelValues(r).Observe(time.Since(start).Seconds()) + return served, nil + } + r := result(false) crdServesLookupTotal.WithLabelValues(r).Inc() crdServesLookupLat.WithLabelValues(r).Observe(time.Since(start).Seconds()) @@ -180,7 +196,7 @@ func (m *CRDRuntimeManager) ensureSharedRuntime(stopCh <-chan struct{}) (runtime go crdServer.GenericAPIServer.RunPostStartHooks(runCtx) entry := runtimeEntry{ - handler: crdServer.GenericAPIServer.Handler.NonGoRestfulMux, + handler: crdServer.GenericAPIServer.Handler.Director, server: crdServer.GenericAPIServer, cancel: cancel, } @@ -261,6 +277,52 @@ func (m *CRDRuntimeManager) lookupFromInformerIndex(clusterID, group, version st return m.servesIndex.Lookup(clusterID, group, version) } +func (m *CRDRuntimeManager) lookupFromSharedProjection(clusterID, group, version string) (bool, bool) { + crds := m.sharedProjection.List(clusterID) + objs := make([]interface{}, 0, len(crds)) + served := false + for _, crd := range crds { + if crd == nil { + continue + } + objs = append(objs, crd) + if !isCRDEstablished(crd) || crd.Spec.Group != group { + continue + } + for _, v := range crd.Spec.Versions { + if v.Served && v.Name == version { + served = true + break + } + } + } + m.rebuildClusterIndex(clusterID, objs) + return served, true +} + +func (m *CRDRuntimeManager) waitForSharedProjection(clusterID, group, version string, timeout time.Duration) (bool, bool) { + if timeout <= 0 { + timeout 
= 2 * time.Second + } + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if served, ok := m.lookupFromInformerIndex(clusterID, group, version); ok { + return served, true + } + if served, ok := m.lookupFromSharedProjection(clusterID, group, version); ok && served { + return true, true + } + time.Sleep(25 * time.Millisecond) + } + if served, ok := m.lookupFromInformerIndex(clusterID, group, version); ok { + return served, true + } + if served, ok := m.lookupFromSharedProjection(clusterID, group, version); ok { + return served, true + } + return false, false +} + func (m *CRDRuntimeManager) ensureSharedCRDState(stopCh <-chan struct{}) error { m.mu.Lock() if m.sharedStarted { From e5b96f40d3f30669245127227fbc395e9b4e493e Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Fri, 27 Feb 2026 21:33:57 -0800 Subject: [PATCH 2/2] feat: cluster context storage rewrite --- README.md | 27 +- cmd/apiserver/app/config.go | 25 +- cmd/apiserver/app/server.go | 3 +- cmd/apiserver/app/version_override.go | 51 ++ cmd/apiserver/app/version_override_test.go | 30 + docs/storage-aware-key-design.md | 184 ++++++ pkg/multicluster/admission/mutating.go | 23 - .../admission/namespace/manager.go | 51 +- .../admission/namespace/scoped_factory.go | 47 +- pkg/multicluster/admission/validating.go | 43 -- pkg/multicluster/admission/webhook/manager.go | 76 +-- .../admission/webhook/scoped_factory.go | 34 +- pkg/multicluster/auth/manager.go | 296 ++++++++- pkg/multicluster/auth/scoped_factory.go | 211 +++++-- pkg/multicluster/bootstrap/crd_controller.go | 5 +- .../bootstrap/crd_runtime_manager_wrapped.go | 436 ++++++++++++- .../bootstrap/kubernetesservice_controller.go | 5 + pkg/multicluster/object_identity.go | 15 + pkg/multicluster/options.go | 4 - pkg/multicluster/scopedinformer/shared.go | 116 +++- .../scopedinformer/shared_test.go | 80 +++ pkg/multicluster/storage.go | 589 +++++++++++++++++- pkg/multicluster/storage/entry.go | 31 + pkg/multicluster/storage/keyaware_cache.go 
| 40 ++ pkg/multicluster/storage/keyaware_index.go | 9 + .../storage/keyaware_index_test.go | 12 + pkg/multicluster/storage/keyaware_watch.go | 50 ++ .../storage/keyaware_watch_test.go | 34 + .../storage/placement_resolver.go | 36 ++ .../storage/placement_resolver_test.go | 32 + pkg/multicluster/storage_test.go | 8 +- pkg/multicluster/storagekeyaware/backend.go | 420 +++++++++++++ test/smoke/apiserver_test.go | 6 +- test/smoke/auth_restart_test.go | 121 ++++ test/smoke/auth_test.go | 17 +- test/smoke/memory_200vcp_test.go | 2 +- 36 files changed, 2731 insertions(+), 438 deletions(-) create mode 100644 cmd/apiserver/app/version_override.go create mode 100644 cmd/apiserver/app/version_override_test.go create mode 100644 docs/storage-aware-key-design.md create mode 100644 pkg/multicluster/object_identity.go create mode 100644 pkg/multicluster/scopedinformer/shared_test.go create mode 100644 pkg/multicluster/storage/entry.go create mode 100644 pkg/multicluster/storage/keyaware_cache.go create mode 100644 pkg/multicluster/storage/keyaware_index.go create mode 100644 pkg/multicluster/storage/keyaware_index_test.go create mode 100644 pkg/multicluster/storage/keyaware_watch.go create mode 100644 pkg/multicluster/storage/keyaware_watch_test.go create mode 100644 pkg/multicluster/storage/placement_resolver.go create mode 100644 pkg/multicluster/storage/placement_resolver_test.go create mode 100644 pkg/multicluster/storagekeyaware/backend.go create mode 100644 test/smoke/auth_restart_test.go diff --git a/README.md b/README.md index 22dbb0a..12236f3 100644 --- a/README.md +++ b/README.md @@ -147,29 +147,14 @@ keyFunc := func(obj runtime.Object) (string, error) { } ``` -#### Admission: server-owned cluster label -We stamp a server-owned cluster label on persisted objects and validate it on -create/update to prevent cross-cluster writes and support watchcache keying. 
+#### Admission: internal cluster identity +We store cluster identity in managed fields and validate it on create/update +to prevent cross-cluster writes and support watchcache keying. ```go -lbls := accessor.GetLabels() -if lbls == nil { - lbls = map[string]string{} -} -key := m.Options.ClusterAnnotationKey -if key == "" { - key = mcv1.DefaultClusterAnnotation -} -lbls[key] = cid -accessor.SetLabels(lbls) -``` -Note: the `ClusterAnnotationKey` fallback to `DefaultClusterAnnotation` is -temporary and not ideal. It exists to preserve compatibility while we migrate -callers to an explicit label key; avoid relying on this fallback long term. - -```go -if cid := acc.GetLabels()[key]; cid != reqCID { - return fmt.Errorf("cluster label %q=%q must match request cluster %q", key, cid, reqCID) +mc.SetObjectClusterIdentity(obj, reqCID) +if cid := mc.ObjectClusterIdentity(obj); cid != reqCID { + return fmt.Errorf("cluster identity %q must match request cluster %q", cid, reqCID) } ``` diff --git a/cmd/apiserver/app/config.go b/cmd/apiserver/app/config.go index 088c2fa..64960a1 100644 --- a/cmd/apiserver/app/config.go +++ b/cmd/apiserver/app/config.go @@ -146,7 +146,7 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { }) genericConfig.BuildHandlerChainFunc = func(h http.Handler, conf *server.Config) http.Handler { ex := mc.PathExtractor{PathPrefix: mcOpts.PathPrefix, ControlPlaneSegment: mcOpts.ControlPlaneSegment} - base := server.DefaultBuildHandlerChain(h, conf) + base := withVersionOverride(server.DefaultBuildHandlerChain(h, conf)) dispatch := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { cid, _, _ := mc.FromContext(r.Context()) if cid != "" && cid != mcOpts.DefaultCluster && crdRuntimeMgr != nil { @@ -179,6 +179,8 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { BaseLoopbackClientConfig: genericConfig.LoopbackClientConfig, PathPrefix: mcOpts.PathPrefix, ControlPlaneSegment: mcOpts.ControlPlaneSegment, + EtcdPrefix: 
storageFactory.StorageConfig.Prefix, + EtcdTransport: storageFactory.StorageConfig.Transport, Authentication: opts.Authentication, Authorization: opts.Authorization, EgressSelector: genericConfig.EgressSelector, @@ -190,7 +192,9 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { genericConfig.Authentication.Authenticator = mcauth.NewClusterAuthenticator(mcOpts.DefaultCluster, genericConfig.Authentication.Authenticator, authManager) } if genericConfig.Authorization.Authorizer != nil { - clusterAuthorizer := mcauth.NewClusterAuthorizer(mcOpts.DefaultCluster, genericConfig.Authorization.Authorizer, genericConfig.RuleResolver, authManager) + // Route root and tenant authorization through the same multicluster + // manager-backed path to avoid root-only stale lister divergence. + clusterAuthorizer := mcauth.NewClusterAuthorizer(mcOpts.DefaultCluster, nil, nil, authManager) genericConfig.Authorization.Authorizer = clusterAuthorizer genericConfig.RuleResolver = clusterAuthorizer } @@ -314,6 +318,9 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { if apiExtensions.GenericConfig.RESTOptionsGetter != nil { apiExtensions.GenericConfig.RESTOptionsGetter = decorateRESTOptionsGetter("apiextensions", apiExtensions.GenericConfig.RESTOptionsGetter, mcOpts) } + if apiExtensions.ExtraConfig.CRDRESTOptionsGetter != nil { + apiExtensions.ExtraConfig.CRDRESTOptionsGetter = decorateRESTOptionsGetter("apiextensions-crd", apiExtensions.ExtraConfig.CRDRESTOptionsGetter, mcOpts) + } apiExtensionsClientPool := mc.NewAPIExtensionsClientPool(apiExtensions.GenericConfig.LoopbackClientConfig, mcOpts.PathPrefix, mcOpts.ControlPlaneSegment) apiExtensionsInformerPool := mc.NewAPIExtensionsInformerPoolFromClientPool(apiExtensionsClientPool, 0, genericConfig.DrainedNotify()) crdRuntimeMgr = mcbootstrap.NewCRDRuntimeManager(mcbootstrap.CRDRuntimeManagerOptions{ @@ -322,6 +329,8 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { PathPrefix: 
mcOpts.PathPrefix, ControlPlaneSegment: mcOpts.ControlPlaneSegment, DefaultCluster: mcOpts.DefaultCluster, + EtcdPrefix: storageFactory.StorageConfig.Prefix, + EtcdTransport: storageFactory.StorageConfig.Transport, }) setCRDRequestResolvers(apiExtensions, crdRuntimeMgr.CRDGetterForRequest, crdRuntimeMgr.CRDListerForRequest) crdController := mcbootstrap.NewMulticlusterCRDController(crdRuntimeMgr, mcOpts.DefaultCluster) @@ -364,7 +373,7 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { // Ensure CRDs are also routed through the multicluster handler apiExtensions.GenericConfig.BuildHandlerChainFunc = func(h http.Handler, conf *server.Config) http.Handler { ex := mc.PathExtractor{PathPrefix: mcOpts.PathPrefix, ControlPlaneSegment: mcOpts.ControlPlaneSegment} - base := server.DefaultBuildHandlerChain(h, conf) + base := withVersionOverride(server.DefaultBuildHandlerChain(h, conf)) dispatch := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { cid, _, _ := mc.FromContext(r.Context()) if cid != "" && cid != mcOpts.DefaultCluster { @@ -381,11 +390,11 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { mut := mca.NewMutating(mcOpts) val := mca.NewValidating(mcOpts) base := apiExtensions.GenericConfig.AdmissionControl - chain := []admission.Interface{mut, mcNamespaceLifecycle, mcMutatingWebhook} + chain := []admission.Interface{mut, mcNamespaceLifecycle} if base != nil { chain = append(chain, base) } - chain = append(chain, mcValidatingWebhook, val) + chain = append(chain, val) apiExtensions.GenericConfig.AdmissionControl = admission.NewChainHandler(chain...) 
} c.ApiExtensions = apiExtensions @@ -400,7 +409,7 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { // Ensure aggregator also receives multicluster routing aggregator.GenericConfig.BuildHandlerChainFunc = func(h http.Handler, conf *server.Config) http.Handler { ex := mc.PathExtractor{PathPrefix: mcOpts.PathPrefix, ControlPlaneSegment: mcOpts.ControlPlaneSegment} - base := server.DefaultBuildHandlerChain(h, conf) + base := withVersionOverride(server.DefaultBuildHandlerChain(h, conf)) dispatch := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { cid, _, _ := mc.FromContext(r.Context()) if cid != "" && cid != mcOpts.DefaultCluster && crdRuntimeMgr != nil { @@ -417,11 +426,11 @@ func NewConfig(opts options.CompletedOptions) (*Config, error) { mut := mca.NewMutating(mcOpts) val := mca.NewValidating(mcOpts) base := aggregator.GenericConfig.AdmissionControl - chain := []admission.Interface{mut, mcNamespaceLifecycle, mcMutatingWebhook} + chain := []admission.Interface{mut, mcNamespaceLifecycle} if base != nil { chain = append(chain, base) } - chain = append(chain, mcValidatingWebhook, val) + chain = append(chain, val) aggregator.GenericConfig.AdmissionControl = admission.NewChainHandler(chain...) } c.Aggregator = aggregator diff --git a/cmd/apiserver/app/server.go b/cmd/apiserver/app/server.go index f30bcf1..7f439c7 100644 --- a/cmd/apiserver/app/server.go +++ b/cmd/apiserver/app/server.go @@ -49,7 +49,6 @@ import ( logsapi "k8s.io/component-base/logs/api/v1" _ "k8s.io/component-base/metrics/prometheus/workqueue" "k8s.io/component-base/term" - utilversion "k8s.io/component-base/version" "k8s.io/component-base/version/verflag" "k8s.io/component-base/zpages/flagz" "k8s.io/klog/v2" @@ -148,7 +147,7 @@ cluster's shared state through which all other components interact.`, // Run runs the specified APIServer. This should never exit. 
func Run(ctx context.Context, opts options.CompletedOptions) error { // To help debugging, immediately log version - klog.Infof("Version: %+v", utilversion.Get()) + klog.Infof("Version: %+v", normalizedServerVersion()) klog.InfoS("Golang settings", "GOGC", os.Getenv("GOGC"), "GOMAXPROCS", os.Getenv("GOMAXPROCS"), "GOTRACEBACK", os.Getenv("GOTRACEBACK")) diff --git a/cmd/apiserver/app/version_override.go b/cmd/apiserver/app/version_override.go new file mode 100644 index 0000000..a6b3843 --- /dev/null +++ b/cmd/apiserver/app/version_override.go @@ -0,0 +1,51 @@ +package app + +import ( + "encoding/json" + "net/http" + "strings" + + apimachineryversion "k8s.io/apimachinery/pkg/version" + utilversion "k8s.io/component-base/version" +) + +const gitVersionArchivePlaceholder = "v0.0.0-master+$Format:%H$" + +func withVersionOverride(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if isVersionPath(r.URL.Path) { + info := normalizedServerVersion() + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(info) + return + } + next.ServeHTTP(w, r) + }) +} + +func isVersionPath(path string) bool { + if path == "/version" || path == "/version/" { + return true + } + trimmed := strings.Trim(path, "/") + parts := strings.Split(trimmed, "/") + return len(parts) == 3 && parts[0] == "clusters" && parts[2] == "version" +} + +func normalizedServerVersion() apimachineryversion.Info { + info := utilversion.Get() + if info.GitVersion != gitVersionArchivePlaceholder { + return info + } + + parts := strings.SplitN(utilversion.DefaultKubeBinaryVersion, ".", 2) + if len(parts) != 2 { + return info + } + + info.Major = parts[0] + info.Minor = parts[1] + info.GitVersion = "v" + utilversion.DefaultKubeBinaryVersion + ".0" + return info +} + diff --git a/cmd/apiserver/app/version_override_test.go b/cmd/apiserver/app/version_override_test.go new file mode 100644 index 0000000..134a55f --- /dev/null +++ 
b/cmd/apiserver/app/version_override_test.go @@ -0,0 +1,30 @@ +package app + +import "testing" + +func TestIsVersionPath(t *testing.T) { + tests := []struct { + path string + want bool + }{ + {path: "/version", want: true}, + {path: "/version/", want: true}, + {path: "/clusters/kpt-1/version", want: true}, + {path: "/clusters/kpt-1/version/", want: true}, + {path: "/apis", want: false}, + } + + for _, tt := range tests { + if got := isVersionPath(tt.path); got != tt.want { + t.Fatalf("isVersionPath(%q)=%v, want %v", tt.path, got, tt.want) + } + } +} + +func TestNormalizedServerVersion(t *testing.T) { + info := normalizedServerVersion() + if info.GitVersion == gitVersionArchivePlaceholder { + t.Fatalf("expected non-placeholder git version, got %q", info.GitVersion) + } +} + diff --git a/docs/storage-aware-key-design.md b/docs/storage-aware-key-design.md new file mode 100644 index 0000000..976a47c --- /dev/null +++ b/docs/storage-aware-key-design.md @@ -0,0 +1,184 @@ +# Key-Aware Shared Storage Design + +## Overview +This document proposes a storage architecture that keeps the shared informer/watch +fanout model while removing the need to persist a user-visible cluster label on +objects for list/watch correctness. + +The core idea is to carry cluster identity as internal storage metadata derived +from keyspace semantics, not from object metadata. + +## Problem Statement +Current multicluster storage uses one shared store/watchcache per resource kind, +which is the right scalability direction. However, cache keying currently depends +on a server-owned label (for example `multicluster.k8s.io/cluster`) persisted on +objects so cluster identity can be recovered from object-only key functions. + +That creates two issues: +- **API shape drift**: clients may observe server-owned placement labels. +- **Coupling**: cluster identity is coupled to object metadata rather than storage + metadata, making future layout changes harder. 
+ +## Goals +- Preserve one shared watcher/cache per resource kind. +- Preserve strict per-cluster list/watch/read/write isolation. +- Eliminate dependency on object labels for watchcache/index keying. +- Keep cluster identity internal-only and non-serialized to clients. +- Allow future etcd key layout evolution without touching API object schema. + +## Non-Goals +- Replacing Kubernetes generic registry behavior wholesale. +- Introducing per-cluster watchcaches/stores again. +- Changing external Kubernetes API semantics for selectors/versioning/watch. + +## Design Summary +Introduce a key-aware internal envelope used inside storage/cache fanout: +- `Object runtime.Object` (decoded Kubernetes object) +- `ClusterID string` (internal placement identity) +- `StorageKey string` (canonical key) +- optional `LayoutVersion string` + +All fanout/index decisions use `ClusterID` from the envelope. Only `Object` is +serialized in API responses. + +### Placement Resolver +Add a small abstraction that resolves cluster identity from keyspace metadata: +- `ClusterFromStorageKey(key string) (clusterID string, ok bool)` +- versioned implementations for key layout evolution. + +This centralizes parsing and decouples routing logic from hard-coded path parsing +scattered across caches. + +## Architecture +1. Request routing still establishes cluster scope in context. +2. Storage key rewrite still places data under cluster-specific subpaths. +3. Shared low-level watch/list reads from kind-root prefix. +4. Internal cache transforms events into envelope entries. +5. Envelope `ClusterID` drives per-request filter/fanout. +6. API response encoding emits only wrapped object payload. 
+ +## New and Modified Files / Methods + +### New Files (proposed) +- `pkg/multicluster/storage/placement_resolver.go` + - `type PlacementResolver interface` + - `type KeyLayoutPlacementResolver struct` + - `func (r *KeyLayoutPlacementResolver) ClusterFromStorageKey(...)` + +- `pkg/multicluster/storage/entry.go` + - `type InternalEntry struct { Object, ClusterID, StorageKey, LayoutVersion }` + - helper constructors and cloning utilities. + +- `pkg/multicluster/storage/keyaware_cache.go` + - shared cache wrapper that consumes watch events and stores `InternalEntry`. + - per-request filtering by cluster. + +- `pkg/multicluster/storage/keyaware_watch.go` + - wraps watch streams and emits only runtime objects while using entry metadata + for internal dispatch. + +- `pkg/multicluster/storage/keyaware_index.go` + - index helpers keyed by `(clusterID, namespace, name)` from entry metadata. + +- `pkg/multicluster/storage/keyaware_cache_test.go` +- `pkg/multicluster/storage/placement_resolver_test.go` + +### Existing Files to Modify +- `pkg/multicluster/storage.go` + - keep context-based key rewrite and shared kind-root behavior. + - replace object-label-based key derivation path with key-aware cache adapter. + - remove cluster-label enforcement paths once migration completes. + +- `pkg/multicluster/options.go` + - add options for placement resolver selection and layout version. + +- `cmd/apiserver/app/config.go` + - wire new decorator/options into server config path. + +- `pkg/multicluster/storage_test.go` + - adapt tests to assert isolation without relying on persisted cluster label. + +- `test/smoke/*.go` (targeted) + - add/adjust smoke tests for list/watch isolation with no visible cluster label. + +## Wiring Plan + +### 1) Decorator Wiring +Continue using `RESTOptionsDecorator` entrypoint, but swap internals: +- current: wraps storage and relies on object metadata for cache keying. +- proposed: wraps storage with key-aware cache/entry adapter. 
+ +Wiring location: +- `cmd/apiserver/app/config.go` + - existing `decorateRESTOptionsGetter(...)` integration remains the anchor. + +### 2) Resolver Injection +Inject `PlacementResolver` via multicluster options: +- default resolver handles current `/.../clusters//...` layout. +- optional future resolver supports new key layouts. + +## Upstream Reuse vs Custom Implementation + +### Reuse from Upstream +- Generic registry strategy flow (`Create/Get/List/Update/Delete/Watch` contract). +- Etcd storage backend and encoding pipeline. +- Selection predicates and API machinery object conversion. +- Existing apiserver handler chain, authn/authz/admission plumbing. + +### Wrap / Extend Locally +- Multicluster RESTOptions decoration and key rewrite. +- Placement resolver and internal entry envelope. +- Shared cache dispatch keyed by cluster metadata. + +### Likely Not Reusable As-Is +- Upstream cacher assumptions where keying/indexing is object-only. + - We should avoid deep invasive upstream patching in-tree. + - Prefer local key-aware adapter/fork boundary with minimal surface area. + +## Migration Plan + +### Phase 0: Design and Scaffolding +- Add resolver and entry types. +- No behavior change. + +### Phase 1: Immediate Cutover +- Implement key-aware cache path and switch read/watch dispatch directly during implementation. +- Keep label writes temporarily for rollback safety. + +### Phase 2: Remove Label Dependency +- Stop requiring label for keying/indexing. +- Remove cluster-label mutating/validation requirements where safe. +- Add response-shape tests ensuring no server-owned cluster label leakage. + +### Phase 3: Cleanup +- Delete legacy code paths. +- Update docs and operational runbooks. + +## Validation and Test Plan +- Unit: + - resolver correctness for current and alternate layouts. + - key-aware cache fanout correctness and concurrency behavior. + - watch event ordering and bookmark behavior. +- Integration: + - existing smoke suite must pass. 
+ - dedicated tests for list/watch isolation with absent cluster label. + - CRD/list/watch behavior parity tests. +- Performance: + - compare goroutine count, memory, and watch fanout metrics vs current model. + +## Risks and Mitigations +- **Risk**: subtle watch semantics regressions. + - Mitigation: broaden integration/smoke coverage and validate watch semantics in targeted tests before merge. +- **Risk**: cache key collisions across clusters. + - Mitigation: explicit `(clusterID, namespace, name)` composite indexes. +- **Risk**: rollout complexity. + - Mitigation: feature gate + phased migration + smoke regression gates. + +## Open Questions +- Should we expose layout version in metrics for easier migrations? +- Where should fork boundary live to minimize divergence from upstream changes? + +## Decision +Proceed with key-aware internal entry and placement resolver design, preserving +shared informer/watch fanout while removing object-label dependency for cluster +identity. diff --git a/pkg/multicluster/admission/mutating.go b/pkg/multicluster/admission/mutating.go index e90a26b..65747ed 100644 --- a/pkg/multicluster/admission/mutating.go +++ b/pkg/multicluster/admission/mutating.go @@ -6,7 +6,6 @@ import ( mcv1 "github.com/kplane-dev/apiserver/pkg/multicluster" mcauth "github.com/kplane-dev/apiserver/pkg/multicluster/auth" - "k8s.io/apimachinery/pkg/api/meta" apiserveradmission "k8s.io/apiserver/pkg/admission" authenticationapi "k8s.io/kubernetes/pkg/apis/authentication" ) @@ -45,27 +44,5 @@ func (m *Mutating) Admit(ctx context.Context, a apiserveradmission.Attributes, _ if gvk.Group == "authorization.k8s.io" || gvk.Group == "authentication.k8s.io" || strings.HasSuffix(gvk.Kind, "Review") { return nil } - obj := a.GetObject() - if obj == nil { - return nil - } - cid, _, _ := mcv1.FromContext(ctx) - if cid == "" { - cid = m.Options.DefaultCluster - } - accessor, err := meta.Accessor(obj) - if err != nil { - return nil - } - lbls := accessor.GetLabels() - if lbls 
== nil { - lbls = map[string]string{} - } - key := m.Options.ClusterAnnotationKey - if key == "" { - key = mcv1.DefaultClusterAnnotation - } - lbls[key] = cid - accessor.SetLabels(lbls) return nil } diff --git a/pkg/multicluster/admission/namespace/manager.go b/pkg/multicluster/admission/namespace/manager.go index 3717920..6001354 100644 --- a/pkg/multicluster/admission/namespace/manager.go +++ b/pkg/multicluster/admission/namespace/manager.go @@ -3,7 +3,6 @@ package namespace import ( "sync" - "github.com/kplane-dev/apiserver/pkg/multicluster/scopedinformer" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -32,12 +31,6 @@ type Manager struct { mu sync.Mutex clusters map[string]*clusterEnv - - sharedOnce sync.Once - sharedErr error - shared informers.SharedInformerFactory - sharedStop <-chan struct{} - sharedOwn chan struct{} } type clusterEnv struct { @@ -73,7 +66,7 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { if err != nil { return nil, err } - scoped, err := m.scopedNamespaceFactory(clusterID) + scoped, err := m.scopedNamespaceFactory(clusterID, cs) if err != nil { return nil, err } @@ -93,45 +86,15 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { return e, nil } -func (m *Manager) scopedNamespaceFactory(clusterID string) (informers.SharedInformerFactory, error) { - shared, err := m.ensureSharedFactory() - if err != nil { - return nil, err - } - return newScopedFactory(clusterID, mc.DefaultClusterAnnotation, shared), nil -} - -func (m *Manager) ensureSharedFactory() (informers.SharedInformerFactory, error) { - m.sharedOnce.Do(func() { - if m.opts.BaseLoopbackClientConfig == nil { - m.sharedErr = mc.ErrMissingClientFactory - return - } - cs, err := scopedinformer.NewAllClustersKubeClient(m.opts.BaseLoopbackClientConfig) +func (m *Manager) scopedNamespaceFactory(clusterID string, cs kubernetes.Interface) (informers.SharedInformerFactory, error) { + if m.opts.InformerPool 
!= nil { + _, factory, _, err := m.opts.InformerPool.Get(clusterID) if err != nil { - m.sharedErr = err - return - } - factory := informers.NewSharedInformerFactory(cs, 0) - if err := factory.Core().V1().Namespaces().Informer().SetTransform(transformNamespaceForShared(mc.DefaultClusterAnnotation)); err != nil { - m.sharedErr = err - return - } - if err := scopedinformer.EnsureClusterIndex(factory.Core().V1().Namespaces().Informer(), mc.DefaultClusterAnnotation); err != nil { - m.sharedErr = err - return - } - if m.sharedStop == nil { - m.sharedOwn = make(chan struct{}) - m.sharedStop = m.sharedOwn + return nil, err } - factory.Start(m.sharedStop) - m.shared = factory - }) - if m.sharedErr != nil { - return nil, m.sharedErr + return factory, nil } - return m.shared, nil + return informers.NewSharedInformerFactory(cs, 0), nil } // StopCluster is test-oriented cleanup; production can leave informers running. diff --git a/pkg/multicluster/admission/namespace/scoped_factory.go b/pkg/multicluster/admission/namespace/scoped_factory.go index 1a615b4..a4fc644 100644 --- a/pkg/multicluster/admission/namespace/scoped_factory.go +++ b/pkg/multicluster/admission/namespace/scoped_factory.go @@ -5,8 +5,8 @@ import ( "reflect" "strings" - mc "github.com/kplane-dev/apiserver/pkg/multicluster" "github.com/kplane-dev/apiserver/pkg/multicluster/scopedinformer" + mcstorage "github.com/kplane-dev/apiserver/pkg/multicluster/storage" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/informers" @@ -20,19 +20,14 @@ const sharedNamespaceNamePrefix = "__mcns__" type scopedFactory struct { informers.SharedInformerFactory - clusterID string - clusterLabelKey string - shared informers.SharedInformerFactory + clusterID string + shared informers.SharedInformerFactory } -func newScopedFactory(clusterID, clusterLabelKey string, shared informers.SharedInformerFactory) informers.SharedInformerFactory { - if clusterLabelKey == "" { - clusterLabelKey = 
mc.DefaultClusterAnnotation - } +func newScopedFactory(clusterID string, shared informers.SharedInformerFactory) informers.SharedInformerFactory { return &scopedFactory{ SharedInformerFactory: shared, clusterID: clusterID, - clusterLabelKey: clusterLabelKey, shared: shared, } } @@ -72,7 +67,7 @@ func (v *scopedCoreV1) LimitRanges() coreinformersv1.LimitRangeInformer { func (v *scopedCoreV1) Namespaces() coreinformersv1.NamespaceInformer { base := v.f.shared.Core().V1().Namespaces().Informer() return &scopedNamespaceInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedNamespaceLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } @@ -107,30 +102,50 @@ func (v *scopedCoreV1) ServiceAccounts() coreinformersv1.ServiceAccountInformer return v.f.SharedInformerFactory.Core().V1().ServiceAccounts() } -func newFilteredSharedIndexInformer(shared cache.SharedIndexInformer, clusterID, clusterLabelKey string) cache.SharedIndexInformer { - return scopedinformer.NewFilteredSharedIndexInformer(shared, clusterID, clusterLabelKey) +func newFilteredSharedIndexInformer(shared cache.SharedIndexInformer, clusterID string) cache.SharedIndexInformer { + return scopedinformer.NewFilteredSharedIndexInformer(shared, clusterID) } -func objectCluster(obj interface{}, clusterLabelKey string) string { - return scopedinformer.ObjectCluster(obj, clusterLabelKey) +func objectCluster(obj interface{}) string { + return scopedinformer.ObjectCluster(obj) } func filteredByCluster(indexer cache.Indexer, clusterID string) []interface{} { return scopedinformer.FilteredByCluster(indexer, clusterID) } -func transformNamespaceForShared(clusterLabelKey string) cache.TransformFunc { +func transformNamespaceForShared() cache.TransformFunc { return func(obj interface{}) (interface{}, error) { - cid := objectCluster(obj, clusterLabelKey) + var entry *mcstorage.InternalEntry + if e, 
ok := obj.(*mcstorage.InternalEntry); ok && e != nil { + entry = e + obj = e.Object + } + cid := objectCluster(obj) + if cid == "" && entry != nil { + cid = entry.ClusterID + } if cid == "" { + if entry != nil { + return entry, nil + } return obj, nil } ns, ok := obj.(*corev1.Namespace) if !ok { + if entry != nil { + return entry, nil + } return obj, nil } cp := ns.DeepCopy() cp.Name = encodeSharedNamespaceName(cid, cp.Name) + if entry != nil { + cpEntry := *entry + cpEntry.Object = cp + cpEntry.ClusterID = cid + return &cpEntry, nil + } return cp, nil } } diff --git a/pkg/multicluster/admission/validating.go b/pkg/multicluster/admission/validating.go index 5a01a91..88264cf 100644 --- a/pkg/multicluster/admission/validating.go +++ b/pkg/multicluster/admission/validating.go @@ -2,11 +2,9 @@ package admission import ( "context" - "fmt" "strings" mcv1 "github.com/kplane-dev/apiserver/pkg/multicluster" - "k8s.io/apimachinery/pkg/api/meta" apiserveradmission "k8s.io/apiserver/pkg/admission" ) @@ -32,46 +30,5 @@ func (v *Validating) Validate(ctx context.Context, a apiserveradmission.Attribut if gvk.Group == "authorization.k8s.io" || gvk.Group == "authentication.k8s.io" || strings.HasSuffix(gvk.Kind, "Review") { return nil } - key := v.Options.ClusterAnnotationKey - if key == "" { - key = mcv1.DefaultClusterAnnotation - } - reqCID, _, _ := mcv1.FromContext(ctx) - if reqCID == "" { - reqCID = v.Options.DefaultCluster - } - - if a.GetOperation() == apiserveradmission.Create { - obj := a.GetObject() - if obj == nil { - return nil - } - acc, err := meta.Accessor(obj) - if err != nil { - return nil - } - if cid := acc.GetLabels()[key]; cid != reqCID { - return fmt.Errorf("cluster label %q=%q must match request cluster %q", key, cid, reqCID) - } - return nil - } - - if a.GetOperation() == apiserveradmission.Update { - newObj := a.GetObject() - oldObj := a.GetOldObject() - if newObj == nil || oldObj == nil { - return nil - } - newAcc, err1 := meta.Accessor(newObj) - oldAcc, err2 
:= meta.Accessor(oldObj) - if err1 != nil || err2 != nil { - return nil - } - oldCID := oldAcc.GetLabels()[key] - newCID := newAcc.GetLabels()[key] - if (oldCID != "" && oldCID != reqCID) || (newCID != "" && newCID != oldCID) { - return fmt.Errorf("cross-cluster updates are forbidden (old=%q new=%q request=%q)", oldCID, newCID, reqCID) - } - } return nil } diff --git a/pkg/multicluster/admission/webhook/manager.go b/pkg/multicluster/admission/webhook/manager.go index d8ae25c..7cfeea8 100644 --- a/pkg/multicluster/admission/webhook/manager.go +++ b/pkg/multicluster/admission/webhook/manager.go @@ -1,7 +1,6 @@ package webhook import ( - "fmt" "reflect" "sync" @@ -9,10 +8,8 @@ import ( clientgoinformers "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" mc "github.com/kplane-dev/apiserver/pkg/multicluster" - "github.com/kplane-dev/apiserver/pkg/multicluster/scopedinformer" ) type Options struct { @@ -49,13 +46,6 @@ type Manager struct { mu sync.Mutex clusters map[string]*clusterEnv - - sharedOnce sync.Once - sharedErr error - shared clientgoinformers.SharedInformerFactory - sharedStop <-chan struct{} - sharedOwn chan struct{} - sharedSync chan struct{} } type clusterEnv struct { @@ -96,11 +86,12 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { if err != nil { return nil, err } - scoped, err := m.scopedWebhookFactory(clusterID) + scoped, err := m.scopedWebhookFactory(clusterID, cs) if err != nil { return nil, err } stopCh := make(chan struct{}) + synced := make(chan struct{}) sr := newDirectServiceResolver( scoped.Core().V1().Services().Lister(), @@ -113,7 +104,7 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { cid: clusterID, stopCh: stopCh, ownCh: stopCh, - synced: m.sharedSync, + synced: synced, clientset: cs, informers: scoped, serviceResolver: sr, @@ -126,63 +117,26 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { _ = 
scoped.Admissionregistration().V1().MutatingWebhookConfigurations().Informer() _ = scoped.Admissionregistration().V1().ValidatingWebhookConfigurations().Informer() scoped.Start(stopCh) + go func() { + ok := scoped.WaitForCacheSync(stopCh) + if allSynced(ok) { + close(synced) + } + }() m.clusters[clusterID] = e return e, nil } -func (m *Manager) scopedWebhookFactory(clusterID string) (clientgoinformers.SharedInformerFactory, error) { - shared, err := m.ensureSharedFactory() - if err != nil { - return nil, err - } - return newScopedFactory(clusterID, mc.DefaultClusterAnnotation, shared), nil -} - -func (m *Manager) ensureSharedFactory() (clientgoinformers.SharedInformerFactory, error) { - m.sharedOnce.Do(func() { - if m.opts.BaseLoopbackClientConfig == nil { - m.sharedErr = fmt.Errorf("base loopback config is required for shared webhook factory") - return - } - cs, err := scopedinformer.NewAllClustersKubeClient(m.opts.BaseLoopbackClientConfig) +func (m *Manager) scopedWebhookFactory(clusterID string, cs kubernetes.Interface) (clientgoinformers.SharedInformerFactory, error) { + if m.opts.InformerPool != nil { + _, factory, _, err := m.opts.InformerPool.Get(clusterID) if err != nil { - m.sharedErr = err - return - } - factory := clientgoinformers.NewSharedInformerFactory(cs, 0) - webhookInformers := []cache.SharedIndexInformer{ - factory.Core().V1().Namespaces().Informer(), - factory.Core().V1().Services().Informer(), - factory.Discovery().V1().EndpointSlices().Informer(), - factory.Admissionregistration().V1().MutatingWebhookConfigurations().Informer(), - factory.Admissionregistration().V1().ValidatingWebhookConfigurations().Informer(), - } - for _, inf := range webhookInformers { - if err := scopedinformer.EnsureClusterIndex(inf, mc.DefaultClusterAnnotation); err != nil { - m.sharedErr = err - return - } - } - if m.sharedStop == nil { - m.sharedOwn = make(chan struct{}) - m.sharedStop = m.sharedOwn + return nil, err } - factory.Start(m.sharedStop) - // One shared 
cache-sync signal for all clusters; scoped informers are projections over shared caches. - m.sharedSync = make(chan struct{}) - go func() { - ok := factory.WaitForCacheSync(m.sharedStop) - if allSynced(ok) { - close(m.sharedSync) - } - }() - m.shared = factory - }) - if m.sharedErr != nil { - return nil, m.sharedErr + return factory, nil } - return m.shared, nil + return clientgoinformers.NewSharedInformerFactory(cs, 0), nil } func allSynced(m map[reflect.Type]bool) bool { diff --git a/pkg/multicluster/admission/webhook/scoped_factory.go b/pkg/multicluster/admission/webhook/scoped_factory.go index 9b74036..a692a49 100644 --- a/pkg/multicluster/admission/webhook/scoped_factory.go +++ b/pkg/multicluster/admission/webhook/scoped_factory.go @@ -43,24 +43,18 @@ import ( discoverylisters "k8s.io/client-go/listers/discovery/v1" "k8s.io/client-go/tools/cache" - mc "github.com/kplane-dev/apiserver/pkg/multicluster" "github.com/kplane-dev/apiserver/pkg/multicluster/scopedinformer" ) type scopedFactory struct { - clusterID string - clusterLabelKey string - shared informers.SharedInformerFactory + clusterID string + shared informers.SharedInformerFactory } -func newScopedFactory(clusterID, clusterLabelKey string, shared informers.SharedInformerFactory) informers.SharedInformerFactory { - if clusterLabelKey == "" { - clusterLabelKey = mc.DefaultClusterAnnotation - } +func newScopedFactory(clusterID string, shared informers.SharedInformerFactory) informers.SharedInformerFactory { return &scopedFactory{ - clusterID: clusterID, - clusterLabelKey: clusterLabelKey, - shared: shared, + clusterID: clusterID, + shared: shared, } } @@ -155,7 +149,7 @@ func (v *scopedCoreV1) LimitRanges() coreinformersv1.LimitRangeInformer { func (v *scopedCoreV1) Namespaces() coreinformersv1.NamespaceInformer { base := v.f.shared.Core().V1().Namespaces().Informer() return &scopedNamespaceInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: 
newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedNamespaceLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } @@ -182,7 +176,7 @@ func (v *scopedCoreV1) Secrets() coreinformersv1.SecretInformer { func (v *scopedCoreV1) Services() coreinformersv1.ServiceInformer { base := v.f.shared.Core().V1().Services().Informer() return &scopedServiceInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedServiceLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } @@ -204,7 +198,7 @@ type scopedDiscoveryV1 struct{ f *scopedFactory } func (v *scopedDiscoveryV1) EndpointSlices() discoveryinformersv1.EndpointSliceInformer { base := v.f.shared.Discovery().V1().EndpointSlices().Informer() return &scopedEndpointSliceInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedEndpointSliceLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } @@ -226,14 +220,14 @@ type scopedAdmissionregistrationV1 struct{ f *scopedFactory } func (v *scopedAdmissionregistrationV1) MutatingWebhookConfigurations() admissionregistrationinformersv1.MutatingWebhookConfigurationInformer { base := v.f.shared.Admissionregistration().V1().MutatingWebhookConfigurations().Informer() return &scopedMutatingWebhookConfigurationInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedMutatingWebhookConfigurationLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } func (v *scopedAdmissionregistrationV1) ValidatingWebhookConfigurations() admissionregistrationinformersv1.ValidatingWebhookConfigurationInformer { base := v.f.shared.Admissionregistration().V1().ValidatingWebhookConfigurations().Informer() 
return &scopedValidatingWebhookConfigurationInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedValidatingWebhookConfigurationLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } @@ -244,12 +238,8 @@ func (v *scopedAdmissionregistrationV1) ValidatingAdmissionPolicyBindings() admi return v.f.shared.Admissionregistration().V1().ValidatingAdmissionPolicyBindings() } -func newFilteredSharedIndexInformer(shared cache.SharedIndexInformer, clusterID, clusterLabelKey string) cache.SharedIndexInformer { - return scopedinformer.NewFilteredSharedIndexInformer(shared, clusterID, clusterLabelKey) -} - -func objectCluster(obj interface{}, clusterLabelKey string) string { - return scopedinformer.ObjectCluster(obj, clusterLabelKey) +func newFilteredSharedIndexInformer(shared cache.SharedIndexInformer, clusterID string) cache.SharedIndexInformer { + return scopedinformer.NewFilteredSharedIndexInformer(shared, clusterID) } func filteredByCluster(indexer cache.Indexer, clusterID string) []interface{} { diff --git a/pkg/multicluster/auth/manager.go b/pkg/multicluster/auth/manager.go index 0738acd..65621e5 100644 --- a/pkg/multicluster/auth/manager.go +++ b/pkg/multicluster/auth/manager.go @@ -3,11 +3,16 @@ package auth import ( "context" "fmt" + "strconv" "strings" "sync" + "time" mc "github.com/kplane-dev/apiserver/pkg/multicluster" "github.com/kplane-dev/apiserver/pkg/multicluster/scopedinformer" + mcstorage "github.com/kplane-dev/apiserver/pkg/multicluster/storage" + "go.etcd.io/etcd/client/pkg/v3/transport" + clientv3 "go.etcd.io/etcd/client/v3" rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apiserver/pkg/authentication/authenticator" @@ -23,6 +28,7 @@ import ( corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" + "k8s.io/apiserver/pkg/storage/storagebackend" 
"k8s.io/kubernetes/pkg/controller/serviceaccount" "k8s.io/kubernetes/pkg/features" rbacregistryvalidation "k8s.io/kubernetes/pkg/registry/rbac/validation" @@ -36,6 +42,8 @@ type Options struct { BaseLoopbackClientConfig *rest.Config PathPrefix string ControlPlaneSegment string + EtcdPrefix string + EtcdTransport storagebackend.TransportConfig Authentication *kubeoptions.BuiltInAuthenticationOptions Authorization *kubeoptions.BuiltInAuthorizationOptions EgressSelector *egressselector.EgressSelector @@ -58,6 +66,10 @@ type Manager struct { rbacStore *rbacProjectionStore sharedStop <-chan struct{} sharedOwn chan struct{} + rbacDirty chan struct{} + + etcdMu sync.Mutex + etcdClient *clientv3.Client } type clusterEnv struct { @@ -71,6 +83,7 @@ type clusterEnv struct { ruleResolver authorizer.RuleResolver } + // NewManager constructs a per-cluster auth manager. func NewManager(ctx context.Context, opts Options) *Manager { if ctx == nil { @@ -121,15 +134,14 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { m.mu.Unlock() return env, nil } - if m.opts.ClientPool == nil { m.mu.Unlock() return nil, fmt.Errorf("loopback client pool is required for cluster auth") } + m.mu.Unlock() cs, err := m.opts.ClientPool.KubeClientForCluster(clusterID) if err != nil { - m.mu.Unlock() return nil, err } @@ -142,35 +154,29 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { resolver authorizer.RuleResolver ) if m.useSharedRBACAuthorizer() { - listers, err := m.coreListersForCluster(clusterID) + _, coreFactory, _, err := m.coreAuthFactoryForCluster(clusterID) if err != nil { - m.mu.Unlock() return nil, err } - authn, err = buildAuthenticatorWithCoreListers(m.ctx, m.opts, cs, listers) + authn, err = buildAuthenticator(m.ctx, m.opts, cs, coreFactory) if err != nil { - m.mu.Unlock() return nil, err } authz, resolver, err = m.buildSharedRBACAuthorizerForCluster(clusterID) } else { scopedFactory, err = m.scopedAuthFactory(clusterID) if err != nil { 
- m.mu.Unlock() return nil, err } authn, err = buildAuthenticator(m.ctx, m.opts, cs, scopedFactory) if err != nil { - m.mu.Unlock() return nil, err } authz, resolver, err = buildAuthorizer(m.ctx, m.opts, scopedFactory) } if err != nil { - m.mu.Unlock() return nil, err } - env := &clusterEnv{ cid: clusterID, clientset: cs, @@ -180,17 +186,29 @@ func (m *Manager) envForCluster(clusterID string) (*clusterEnv, error) { ruleResolver: resolver, } + m.mu.Lock() + if existing, ok := m.clusters[clusterID]; ok { + m.mu.Unlock() + return existing, nil + } m.clusters[clusterID] = env m.mu.Unlock() return env, nil } +func (m *Manager) coreAuthFactoryForCluster(clusterID string) (kubernetes.Interface, informers.SharedInformerFactory, <-chan struct{}, error) { + if m.opts.InformerPool == nil { + return nil, nil, nil, fmt.Errorf("informer pool is required for core authn in shared RBAC mode") + } + return m.opts.InformerPool.Get(clusterID) +} + func (m *Manager) scopedAuthFactory(clusterID string) (informers.SharedInformerFactory, error) { shared, err := m.ensureSharedAuthFactory() if err != nil { return nil, err } - return newScopedFactory(clusterID, mc.DefaultClusterAnnotation, shared, m.rbacStore), nil + return newScopedFactory(clusterID, shared, m.rbacStore), nil } func (m *Manager) ensureSharedAuthFactory() (informers.SharedInformerFactory, error) { @@ -211,27 +229,43 @@ func (m *Manager) ensureSharedAuthFactory() (informers.SharedInformerFactory, er factory.Core().V1().ServiceAccounts().Informer(), factory.Core().V1().Pods().Informer(), factory.Core().V1().Nodes().Informer(), - factory.Rbac().V1().Roles().Informer(), - factory.Rbac().V1().RoleBindings().Informer(), - factory.Rbac().V1().ClusterRoles().Informer(), - factory.Rbac().V1().ClusterRoleBindings().Informer(), } for _, inf := range authInformers { - if err := scopedinformer.EnsureClusterIndex(inf, mc.DefaultClusterAnnotation); err != nil { + if err := inf.SetTransform(scopedinformer.ClusterEntryTransform()); err != 
nil { + m.sharedErr = err + return + } + if err := scopedinformer.EnsureClusterIndex(inf); err != nil { m.sharedErr = err return } } - rbacStore := newRBACProjectionStore(mc.DefaultClusterAnnotation) - if err := registerRBACProjectionHandlers( - rbacStore, + rbacStore := newRBACProjectionStore() + m.rbacDirty = make(chan struct{}, 1) + for _, inf := range []cache.SharedIndexInformer{ factory.Rbac().V1().Roles().Informer(), factory.Rbac().V1().RoleBindings().Informer(), factory.Rbac().V1().ClusterRoles().Informer(), factory.Rbac().V1().ClusterRoleBindings().Informer(), - ); err != nil { - m.sharedErr = err - return + } { + _, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + _ = obj + m.markRBACProjectionDirty() + }, + UpdateFunc: func(oldObj, newObj interface{}) { + _, _ = oldObj, newObj + m.markRBACProjectionDirty() + }, + DeleteFunc: func(obj interface{}) { + _ = obj + m.markRBACProjectionDirty() + }, + }) + if err != nil { + m.sharedErr = err + return + } } if m.sharedStop == nil { m.sharedOwn = make(chan struct{}) @@ -240,6 +274,8 @@ func (m *Manager) ensureSharedAuthFactory() (informers.SharedInformerFactory, er factory.Start(m.sharedStop) m.sharedAuth = factory m.rbacStore = rbacStore + go m.runRBACProjectionRebuildWorker() + m.markRBACProjectionDirty() }) if m.sharedErr != nil { return nil, m.sharedErr @@ -247,6 +283,45 @@ func (m *Manager) ensureSharedAuthFactory() (informers.SharedInformerFactory, er return m.sharedAuth, nil } +func (m *Manager) runRBACProjectionRebuildWorker() { + if m == nil || m.sharedStop == nil { + return + } + timer := time.NewTimer(0) + if !timer.Stop() { + select { + case <-timer.C: + default: + } + } + pending := false + for { + select { + case <-m.sharedStop: + return + case <-m.rbacDirty: + pending = true + timer.Reset(150 * time.Millisecond) + case <-timer.C: + if !pending { + continue + } + pending = false + m.rebuildSharedRBACProjection() + } + } +} + +func (m *Manager) 
markRBACProjectionDirty() { + if m == nil || m.rbacDirty == nil { + return + } + select { + case m.rbacDirty <- struct{}{}: + default: + } +} + type coreAuthListers struct { secrets corelisters.SecretLister serviceAccounts corelisters.ServiceAccountLister @@ -261,24 +336,20 @@ func (m *Manager) coreListersForCluster(clusterID string) (*coreAuthListers, err } return &coreAuthListers{ secrets: &scopedSecretLister{ - indexer: shared.Core().V1().Secrets().Informer().GetIndexer(), - clusterID: clusterID, - clusterLabelKey: mc.DefaultClusterAnnotation, + indexer: shared.Core().V1().Secrets().Informer().GetIndexer(), + clusterID: clusterID, }, serviceAccounts: &scopedServiceAccountLister{ - indexer: shared.Core().V1().ServiceAccounts().Informer().GetIndexer(), - clusterID: clusterID, - clusterLabelKey: mc.DefaultClusterAnnotation, + indexer: shared.Core().V1().ServiceAccounts().Informer().GetIndexer(), + clusterID: clusterID, }, pods: &scopedPodLister{ - indexer: shared.Core().V1().Pods().Informer().GetIndexer(), - clusterID: clusterID, - clusterLabelKey: mc.DefaultClusterAnnotation, + indexer: shared.Core().V1().Pods().Informer().GetIndexer(), + clusterID: clusterID, }, nodes: &scopedNodeLister{ - indexer: shared.Core().V1().Nodes().Informer().GetIndexer(), - clusterID: clusterID, - clusterLabelKey: mc.DefaultClusterAnnotation, + indexer: shared.Core().V1().Nodes().Informer().GetIndexer(), + clusterID: clusterID, }, }, nil } @@ -300,6 +371,7 @@ func (m *Manager) buildSharedRBACAuthorizerForCluster(clusterID string) (authori if m.rbacStore == nil { return nil, nil, fmt.Errorf("shared RBAC projection store is not initialized") } + m.rebuildSharedRBACProjection() resolver := &clusterAwareRBACDataSource{ store: m.rbacStore, defaultCluster: clusterID, @@ -311,6 +383,162 @@ func (m *Manager) buildSharedRBACAuthorizerForCluster(clusterID string) (authori return authzunion.New(superuser, rbacAuthz), rbacAuthz, nil } +func (m *Manager) rebuildSharedRBACProjection() { + if m == 
nil || m.sharedAuth == nil || m.rbacStore == nil { + return + } + roles := m.sharedAuth.Rbac().V1().Roles().Informer().GetStore().List() + roleBindings := m.sharedAuth.Rbac().V1().RoleBindings().Informer().GetStore().List() + clusterRoles := m.sharedAuth.Rbac().V1().ClusterRoles().Informer().GetStore().List() + clusterRoleBindings := m.sharedAuth.Rbac().V1().ClusterRoleBindings().Informer().GetStore().List() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + var ( + roleIndex map[string]string + roleBindingIndex map[string]string + clusterRoleIndex map[string]string + clusterRoleBindingIndex map[string]string + errRoles, errRB error + errCR, errCRB error + wg sync.WaitGroup + ) + wg.Add(4) + go func() { + defer wg.Done() + roleIndex, errRoles = m.revisionClusterIndex(ctx, "/roles/clusters") + }() + go func() { + defer wg.Done() + roleBindingIndex, errRB = m.revisionClusterIndex(ctx, "/rolebindings/clusters") + }() + go func() { + defer wg.Done() + clusterRoleIndex, errCR = m.revisionClusterIndex(ctx, "/clusterroles/clusters") + }() + go func() { + defer wg.Done() + clusterRoleBindingIndex, errCRB = m.revisionClusterIndex(ctx, "/clusterrolebindings/clusters") + }() + wg.Wait() + // Do not fall back to UID/RV object-memory attribution when durable key + // indexing fails; that can leak cross-cluster RBAC permissions. 
+ if errRoles != nil { + roleIndex = map[string]string{} + } + if errRB != nil { + roleBindingIndex = map[string]string{} + } + if errCR != nil { + clusterRoleIndex = map[string]string{} + } + if errCRB != nil { + clusterRoleBindingIndex = map[string]string{} + } + m.rbacStore.RebuildFromInformerStoresWithResolver( + roles, roleBindings, clusterRoles, clusterRoleBindings, + func(rv, namespace, name string) string { return clusterFromRevisionIndex(roleIndex, rv, namespace, name) }, + func(rv, namespace, name string) string { return clusterFromRevisionIndex(roleBindingIndex, rv, namespace, name) }, + func(rv, namespace, name string) string { return clusterFromRevisionIndex(clusterRoleIndex, rv, namespace, name) }, + func(rv, namespace, name string) string { return clusterFromRevisionIndex(clusterRoleBindingIndex, rv, namespace, name) }, + ) +} + +func clusterFromRevisionIndex(index map[string]string, rv, namespace, name string) string { + if len(index) == 0 || rv == "" { + return "" + } + return index[rv+"|"+namespace+"|"+name] +} + +func (m *Manager) revisionClusterIndex(ctx context.Context, key string) (map[string]string, error) { + cli, err := m.getETCDClient() + if err != nil { + return nil, err + } + fullPrefix := strings.TrimSuffix(m.opts.EtcdPrefix, "/") + "/" + strings.TrimPrefix(key, "/") + queryPrefix := strings.TrimSuffix(fullPrefix, "/") + "/" + etcdCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + resp, err := cli.Get(etcdCtx, queryPrefix, clientv3.WithPrefix(), clientv3.WithKeysOnly()) + if err != nil { + return nil, err + } + out := make(map[string]string, len(resp.Kvs)) + resolver := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: strings.TrimSuffix(key, "/")} + storagePrefix := strings.TrimSuffix(key, "/") + "/" + for _, kv := range resp.Kvs { + etcdKey := string(kv.Key) + storageKey := strings.TrimPrefix(etcdKey, strings.TrimSuffix(m.opts.EtcdPrefix, "/")) + if storageKey == etcdKey { + continue + } + if 
!strings.HasPrefix(storageKey, "/") { + storageKey = "/" + storageKey + } + cid, ok := resolver.ClusterFromStorageKey(storageKey) + if !ok || cid == "" { + continue + } + rv := strconv.FormatInt(kv.ModRevision, 10) + out[rv] = cid + if ns, name, ok := objectPathFromStorageKey(storageKey, storagePrefix); ok { + out[rv+"|"+ns+"|"+name] = cid + } + } + return out, nil +} + +func objectPathFromStorageKey(storageKey, rootPrefix string) (namespace, name string, ok bool) { + if !strings.HasPrefix(storageKey, rootPrefix) { + return "", "", false + } + rest := strings.TrimPrefix(storageKey, rootPrefix) + parts := strings.Split(rest, "/") + if len(parts) < 2 { + return "", "", false + } + switch len(parts) { + case 2: + return "", parts[1], true + default: + return parts[1], parts[2], true + } +} + +func (m *Manager) getETCDClient() (*clientv3.Client, error) { + m.etcdMu.Lock() + defer m.etcdMu.Unlock() + if m.etcdClient != nil { + return m.etcdClient, nil + } + if len(m.opts.EtcdTransport.ServerList) == 0 { + return nil, fmt.Errorf("no etcd servers configured") + } + cfg := clientv3.Config{ + Endpoints: append([]string{}, m.opts.EtcdTransport.ServerList...), + DialTimeout: 5 * time.Second, + } + if m.opts.EtcdTransport.CertFile != "" || m.opts.EtcdTransport.KeyFile != "" || m.opts.EtcdTransport.TrustedCAFile != "" { + tlsInfo := transport.TLSInfo{ + CertFile: m.opts.EtcdTransport.CertFile, + KeyFile: m.opts.EtcdTransport.KeyFile, + TrustedCAFile: m.opts.EtcdTransport.TrustedCAFile, + } + tlsCfg, err := tlsInfo.ClientConfig() + if err != nil { + return nil, err + } + cfg.TLS = tlsCfg + } + cli, err := clientv3.New(cfg) + if err != nil { + return nil, err + } + m.etcdClient = cli + return m.etcdClient, nil +} + type clusterAwareRBACDataSource struct { store *rbacProjectionStore defaultCluster string diff --git a/pkg/multicluster/auth/scoped_factory.go b/pkg/multicluster/auth/scoped_factory.go index 2eb46af..1e69569 100644 --- a/pkg/multicluster/auth/scoped_factory.go +++ 
b/pkg/multicluster/auth/scoped_factory.go @@ -42,29 +42,24 @@ import ( rbaclisters "k8s.io/client-go/listers/rbac/v1" "k8s.io/client-go/tools/cache" - mc "github.com/kplane-dev/apiserver/pkg/multicluster" "github.com/kplane-dev/apiserver/pkg/multicluster/scopedinformer" + mcstorage "github.com/kplane-dev/apiserver/pkg/multicluster/storage" ) type scopedFactory struct { - clusterID string - clusterLabelKey string - shared informers.SharedInformerFactory - rbacStore *rbacProjectionStore + clusterID string + shared informers.SharedInformerFactory + rbacStore *rbacProjectionStore } -func newScopedFactory(clusterID, clusterLabelKey string, shared informers.SharedInformerFactory, rbacStore *rbacProjectionStore) informers.SharedInformerFactory { - if clusterLabelKey == "" { - clusterLabelKey = mc.DefaultClusterAnnotation - } +func newScopedFactory(clusterID string, shared informers.SharedInformerFactory, rbacStore *rbacProjectionStore) informers.SharedInformerFactory { if rbacStore == nil { - rbacStore = newRBACProjectionStore(clusterLabelKey) + rbacStore = newRBACProjectionStore() } return &scopedFactory{ - clusterID: clusterID, - clusterLabelKey: clusterLabelKey, - shared: shared, - rbacStore: rbacStore, + clusterID: clusterID, + shared: shared, + rbacStore: rbacStore, } } @@ -181,8 +176,8 @@ func (v *scopedCoreV1) Namespaces() coreinformersv1.NamespaceInformer { func (v *scopedCoreV1) Nodes() coreinformersv1.NodeInformer { base := v.f.shared.Core().V1().Nodes().Informer() return &scopedNodeInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), - lister: &scopedNodeLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID, clusterLabelKey: v.f.clusterLabelKey}, + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), + lister: &scopedNodeLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } func (v *scopedCoreV1) PersistentVolumes() coreinformersv1.PersistentVolumeInformer { @@ -194,8 +189,8 @@ func (v 
*scopedCoreV1) PersistentVolumeClaims() coreinformersv1.PersistentVolume func (v *scopedCoreV1) Pods() coreinformersv1.PodInformer { base := v.f.shared.Core().V1().Pods().Informer() return &scopedPodInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), - lister: &scopedPodLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID, clusterLabelKey: v.f.clusterLabelKey}, + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), + lister: &scopedPodLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } func (v *scopedCoreV1) PodTemplates() coreinformersv1.PodTemplateInformer { @@ -210,8 +205,8 @@ func (v *scopedCoreV1) ResourceQuotas() coreinformersv1.ResourceQuotaInformer { func (v *scopedCoreV1) Secrets() coreinformersv1.SecretInformer { base := v.f.shared.Core().V1().Secrets().Informer() return &scopedSecretInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), - lister: &scopedSecretLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID, clusterLabelKey: v.f.clusterLabelKey}, + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), + lister: &scopedSecretLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } func (v *scopedCoreV1) Services() coreinformersv1.ServiceInformer { @@ -220,8 +215,8 @@ func (v *scopedCoreV1) Services() coreinformersv1.ServiceInformer { func (v *scopedCoreV1) ServiceAccounts() coreinformersv1.ServiceAccountInformer { base := v.f.shared.Core().V1().ServiceAccounts().Informer() return &scopedServiceAccountInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), - lister: &scopedServiceAccountLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID, clusterLabelKey: v.f.clusterLabelKey}, + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), + lister: &scopedServiceAccountLister{indexer: base.GetIndexer(), clusterID: v.f.clusterID}, } } @@ -242,38 +237,38 @@ type 
scopedRbacV1 struct{ f *scopedFactory } func (v *scopedRbacV1) ClusterRoleBindings() rbacinformersv1.ClusterRoleBindingInformer { base := v.f.shared.Rbac().V1().ClusterRoleBindings().Informer() return &scopedClusterRoleBindingInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedClusterRoleBindingLister{store: v.f.rbacStore, clusterID: v.f.clusterID}, } } func (v *scopedRbacV1) ClusterRoles() rbacinformersv1.ClusterRoleInformer { base := v.f.shared.Rbac().V1().ClusterRoles().Informer() return &scopedClusterRoleInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedClusterRoleLister{store: v.f.rbacStore, clusterID: v.f.clusterID}, } } func (v *scopedRbacV1) RoleBindings() rbacinformersv1.RoleBindingInformer { base := v.f.shared.Rbac().V1().RoleBindings().Informer() return &scopedRoleBindingInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedRoleBindingLister{store: v.f.rbacStore, clusterID: v.f.clusterID}, } } func (v *scopedRbacV1) Roles() rbacinformersv1.RoleInformer { base := v.f.shared.Rbac().V1().Roles().Informer() return &scopedRoleInformer{ - informer: newFilteredSharedIndexInformer(base, v.f.clusterID, v.f.clusterLabelKey), + informer: newFilteredSharedIndexInformer(base, v.f.clusterID), lister: &scopedRoleLister{store: v.f.rbacStore, clusterID: v.f.clusterID}, } } -func newFilteredSharedIndexInformer(shared cache.SharedIndexInformer, clusterID, clusterLabelKey string) cache.SharedIndexInformer { - return scopedinformer.NewFilteredSharedIndexInformer(shared, clusterID, clusterLabelKey) +func newFilteredSharedIndexInformer(shared cache.SharedIndexInformer, clusterID string) cache.SharedIndexInformer 
{ + return scopedinformer.NewFilteredSharedIndexInformer(shared, clusterID) } -func objectCluster(obj interface{}, clusterLabelKey string) string { - return scopedinformer.ObjectCluster(obj, clusterLabelKey) +func objectCluster(obj interface{}) string { + return scopedinformer.ObjectCluster(obj) } func filteredByCluster(indexer cache.Indexer, clusterID string) []interface{} { @@ -282,19 +277,16 @@ func filteredByCluster(indexer cache.Indexer, clusterID string) []interface{} { type rbacProjectionStore struct { mu sync.RWMutex - clusterLabelKey string clusterRoles map[string]*rbacv1.ClusterRole clusterBindings map[string]*rbacv1.ClusterRoleBinding roles map[string]*rbacv1.Role roleBindings map[string]*rbacv1.RoleBinding } -func newRBACProjectionStore(clusterLabelKey string) *rbacProjectionStore { - if clusterLabelKey == "" { - clusterLabelKey = mc.DefaultClusterAnnotation - } +type clusterResolver func(rv, namespace, name string) string + +func newRBACProjectionStore() *rbacProjectionStore { return &rbacProjectionStore{ - clusterLabelKey: clusterLabelKey, clusterRoles: map[string]*rbacv1.ClusterRole{}, clusterBindings: map[string]*rbacv1.ClusterRoleBinding{}, roles: map[string]*rbacv1.Role{}, @@ -302,6 +294,75 @@ func newRBACProjectionStore(clusterLabelKey string) *rbacProjectionStore { } } +func (s *rbacProjectionStore) RebuildFromInformerStores(roleObjs, roleBindingObjs, clusterRoleObjs, clusterRoleBindingObjs []interface{}) { + if s == nil { + return + } + s.mu.Lock() + s.clusterRoles = map[string]*rbacv1.ClusterRole{} + s.clusterBindings = map[string]*rbacv1.ClusterRoleBinding{} + s.roles = map[string]*rbacv1.Role{} + s.roleBindings = map[string]*rbacv1.RoleBinding{} + s.mu.Unlock() + + for _, obj := range roleObjs { + s.upsertRole(obj) + } + for _, obj := range roleBindingObjs { + s.upsertRoleBinding(obj) + } + for _, obj := range clusterRoleObjs { + s.upsertClusterRole(obj) + } + for _, obj := range clusterRoleBindingObjs { + s.upsertClusterRoleBinding(obj) + } 
+} + +func (s *rbacProjectionStore) RebuildFromInformerStoresWithResolver( + roleObjs, roleBindingObjs, clusterRoleObjs, clusterRoleBindingObjs []interface{}, + roleCluster, roleBindingCluster, clusterRoleCluster, clusterRoleBindingCluster clusterResolver, +) { + if s == nil { + return + } + s.mu.Lock() + s.clusterRoles = map[string]*rbacv1.ClusterRole{} + s.clusterBindings = map[string]*rbacv1.ClusterRoleBinding{} + s.roles = map[string]*rbacv1.Role{} + s.roleBindings = map[string]*rbacv1.RoleBinding{} + s.mu.Unlock() + + for _, obj := range roleObjs { + r, ok := tombstoneObj(obj).(*rbacv1.Role) + if !ok || r == nil || roleCluster == nil { + continue + } + s.upsertRoleWithCluster(r, roleCluster(r.ResourceVersion, r.Namespace, r.Name)) + } + for _, obj := range roleBindingObjs { + rb, ok := tombstoneObj(obj).(*rbacv1.RoleBinding) + if !ok || rb == nil || roleBindingCluster == nil { + continue + } + s.upsertRoleBindingWithCluster(rb, roleBindingCluster(rb.ResourceVersion, rb.Namespace, rb.Name)) + } + for _, obj := range clusterRoleObjs { + cr, ok := tombstoneObj(obj).(*rbacv1.ClusterRole) + if !ok || cr == nil || clusterRoleCluster == nil { + continue + } + s.upsertClusterRoleWithCluster(cr, clusterRoleCluster(cr.ResourceVersion, "", cr.Name)) + } + for _, obj := range clusterRoleBindingObjs { + crb, ok := tombstoneObj(obj).(*rbacv1.ClusterRoleBinding) + if !ok || crb == nil || clusterRoleBindingCluster == nil { + continue + } + s.upsertClusterRoleBindingWithCluster(crb, clusterRoleBindingCluster(crb.ResourceVersion, "", crb.Name)) + } +} + func registerRBACProjectionHandlers(store *rbacProjectionStore, roleInf, roleBindingInf, clusterRoleInf, clusterRoleBindingInf cache.SharedIndexInformer) error { register := func(inf cache.SharedIndexInformer, upsert func(interface{}), del func(interface{})) error { _, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{ @@ -326,12 +387,11 @@ func registerRBACProjectionHandlers(store *rbacProjectionStore, roleInf, roleBin 
return nil } -func (s *rbacProjectionStore) objectKey(obj runtime.Object) string { +func (s *rbacProjectionStore) objectKey(obj runtime.Object, clusterID string) string { acc, err := meta.Accessor(obj) if err != nil { return "" } - clusterID := acc.GetLabels()[s.clusterLabelKey] if clusterID == "" { return "" } @@ -342,15 +402,26 @@ func tombstoneObj(obj interface{}) interface{} { if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { return tombstone.Obj } + if entry, ok := obj.(*mcstorage.InternalEntry); ok && entry != nil { + return entry.Object + } return obj } func (s *rbacProjectionStore) upsertClusterRole(obj interface{}) { + clusterID := objectCluster(obj) cr, ok := tombstoneObj(obj).(*rbacv1.ClusterRole) if !ok || cr == nil { return } - key := s.objectKey(cr) + s.upsertClusterRoleWithCluster(cr, clusterID) +} + +func (s *rbacProjectionStore) upsertClusterRoleWithCluster(cr *rbacv1.ClusterRole, clusterID string) { + if cr == nil { + return + } + key := s.objectKey(cr, clusterID) if key == "" { return } @@ -360,11 +431,12 @@ func (s *rbacProjectionStore) upsertClusterRole(obj interface{}) { } func (s *rbacProjectionStore) deleteClusterRole(obj interface{}) { + clusterID := objectCluster(obj) cr, ok := tombstoneObj(obj).(*rbacv1.ClusterRole) if !ok || cr == nil { return } - key := s.objectKey(cr) + key := s.objectKey(cr, clusterID) if key == "" { return } @@ -374,11 +446,19 @@ func (s *rbacProjectionStore) deleteClusterRole(obj interface{}) { } func (s *rbacProjectionStore) upsertClusterRoleBinding(obj interface{}) { + clusterID := objectCluster(obj) crb, ok := tombstoneObj(obj).(*rbacv1.ClusterRoleBinding) if !ok || crb == nil { return } - key := s.objectKey(crb) + s.upsertClusterRoleBindingWithCluster(crb, clusterID) +} + +func (s *rbacProjectionStore) upsertClusterRoleBindingWithCluster(crb *rbacv1.ClusterRoleBinding, clusterID string) { + if crb == nil { + return + } + key := s.objectKey(crb, clusterID) if key == "" { return } @@ -388,11 +468,12 
@@ func (s *rbacProjectionStore) upsertClusterRoleBinding(obj interface{}) { } func (s *rbacProjectionStore) deleteClusterRoleBinding(obj interface{}) { + clusterID := objectCluster(obj) crb, ok := tombstoneObj(obj).(*rbacv1.ClusterRoleBinding) if !ok || crb == nil { return } - key := s.objectKey(crb) + key := s.objectKey(crb, clusterID) if key == "" { return } @@ -402,11 +483,19 @@ func (s *rbacProjectionStore) deleteClusterRoleBinding(obj interface{}) { } func (s *rbacProjectionStore) upsertRole(obj interface{}) { + clusterID := objectCluster(obj) r, ok := tombstoneObj(obj).(*rbacv1.Role) if !ok || r == nil { return } - key := s.objectKey(r) + s.upsertRoleWithCluster(r, clusterID) +} + +func (s *rbacProjectionStore) upsertRoleWithCluster(r *rbacv1.Role, clusterID string) { + if r == nil { + return + } + key := s.objectKey(r, clusterID) if key == "" { return } @@ -416,11 +505,12 @@ func (s *rbacProjectionStore) upsertRole(obj interface{}) { } func (s *rbacProjectionStore) deleteRole(obj interface{}) { + clusterID := objectCluster(obj) r, ok := tombstoneObj(obj).(*rbacv1.Role) if !ok || r == nil { return } - key := s.objectKey(r) + key := s.objectKey(r, clusterID) if key == "" { return } @@ -430,11 +520,19 @@ func (s *rbacProjectionStore) deleteRole(obj interface{}) { } func (s *rbacProjectionStore) upsertRoleBinding(obj interface{}) { + clusterID := objectCluster(obj) rb, ok := tombstoneObj(obj).(*rbacv1.RoleBinding) if !ok || rb == nil { return } - key := s.objectKey(rb) + s.upsertRoleBindingWithCluster(rb, clusterID) +} + +func (s *rbacProjectionStore) upsertRoleBindingWithCluster(rb *rbacv1.RoleBinding, clusterID string) { + if rb == nil { + return + } + key := s.objectKey(rb, clusterID) if key == "" { return } @@ -444,11 +542,12 @@ func (s *rbacProjectionStore) upsertRoleBinding(obj interface{}) { } func (s *rbacProjectionStore) deleteRoleBinding(obj interface{}) { + clusterID := objectCluster(obj) rb, ok := tombstoneObj(obj).(*rbacv1.RoleBinding) if !ok || rb 
== nil { return } - key := s.objectKey(rb) + key := s.objectKey(rb, clusterID) if key == "" { return } @@ -726,9 +825,8 @@ func (i *scopedNodeInformer) Informer() cache.SharedIndexInformer { return i.inf func (i *scopedNodeInformer) Lister() corelisters.NodeLister { return i.lister } type scopedSecretLister struct { - indexer cache.Indexer - clusterID string - clusterLabelKey string + indexer cache.Indexer + clusterID string } func (l *scopedSecretLister) List(sel labels.Selector) (ret []*corev1.Secret, err error) { @@ -775,9 +873,8 @@ func (l *scopedSecretNamespaceLister) Get(name string) (*corev1.Secret, error) { } type scopedServiceAccountLister struct { - indexer cache.Indexer - clusterID string - clusterLabelKey string + indexer cache.Indexer + clusterID string } func (l *scopedServiceAccountLister) List(sel labels.Selector) (ret []*corev1.ServiceAccount, err error) { @@ -824,9 +921,8 @@ func (l *scopedServiceAccountNamespaceLister) Get(name string) (*corev1.ServiceA } type scopedPodLister struct { - indexer cache.Indexer - clusterID string - clusterLabelKey string + indexer cache.Indexer + clusterID string } func (l *scopedPodLister) List(sel labels.Selector) (ret []*corev1.Pod, err error) { @@ -873,9 +969,8 @@ func (l *scopedPodNamespaceLister) Get(name string) (*corev1.Pod, error) { } type scopedNodeLister struct { - indexer cache.Indexer - clusterID string - clusterLabelKey string + indexer cache.Indexer + clusterID string } func (l *scopedNodeLister) List(sel labels.Selector) (ret []*corev1.Node, err error) { diff --git a/pkg/multicluster/bootstrap/crd_controller.go b/pkg/multicluster/bootstrap/crd_controller.go index 2247812..3632452 100644 --- a/pkg/multicluster/bootstrap/crd_controller.go +++ b/pkg/multicluster/bootstrap/crd_controller.go @@ -41,10 +41,13 @@ func (c *MulticlusterCRDController) Start(stopCh <-chan struct{}) { c.mu.Unlock() go c.run() + if c.defaultCluster != "" { + c.EnsureCluster(c.defaultCluster) + } } func (c 
*MulticlusterCRDController) EnsureCluster(clusterID string) { - if c == nil || clusterID == "" || clusterID == c.defaultCluster { + if c == nil || clusterID == "" { return } diff --git a/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go b/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go index 5ecf0a5..06fec49 100644 --- a/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go +++ b/pkg/multicluster/bootstrap/crd_runtime_manager_wrapped.go @@ -4,6 +4,8 @@ import ( "context" "fmt" "net/http" + "os" + "strconv" "strings" "sync" "time" @@ -13,22 +15,29 @@ import ( apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apiextensionsapiserver "k8s.io/apiextensions-apiserver/pkg/apiserver" apiextensionsclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" - apiextensionsinformers "k8s.io/apiextensions-apiserver/pkg/client/informers/externalversions" apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/meta" + apiextensionsinformers "k8s.io/apiextensions-apiserver/pkg/client/informers/externalversions" "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apiserver/pkg/server" + "k8s.io/apiserver/pkg/storage/storagebackend" "k8s.io/apiserver/pkg/util/webhook" + "k8s.io/client-go/dynamic" "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" "k8s.io/klog/v2" + "go.etcd.io/etcd/client/pkg/v3/transport" + clientv3 "go.etcd.io/etcd/client/v3" "k8s.io/apimachinery/pkg/util/validation/field" mc "github.com/kplane-dev/apiserver/pkg/multicluster" + mcstorage "github.com/kplane-dev/apiserver/pkg/multicluster/storage" ) var ( @@ -47,6 +56,8 @@ type CRDRuntimeManagerOptions struct { ControlPlaneSegment string DefaultCluster string 
Delegate http.Handler + EtcdPrefix string + EtcdTransport storagebackend.TransportConfig } type runtimeEntry struct { @@ -76,6 +87,7 @@ type CRDRuntimeManager struct { sharedOwnedStop chan struct{} crdQueue workqueue.TypedRateLimitingInterface[string] crdWorkersStarted bool + crdPlacement *crdStoragePlacementResolver } func NewCRDRuntimeManager(opts CRDRuntimeManagerOptions) *CRDRuntimeManager { @@ -87,6 +99,11 @@ func NewCRDRuntimeManager(opts CRDRuntimeManagerOptions) *CRDRuntimeManager { clients: map[string]apiextensionsclient.Interface{}, servesIndex: NewCRDServesIndex(), sharedProjection: newCRDProjectionStore(), + crdPlacement: newCRDStoragePlacementResolver( + opts.EtcdPrefix, + opts.EtcdTransport, + "/apiextensions.k8s.io/customresourcedefinitions/clusters", + ), crdQueue: workqueue.NewTypedRateLimitingQueueWithConfig( workqueue.DefaultTypedControllerRateLimiter[string](), workqueue.TypedRateLimitingQueueConfig[string]{Name: "mc_shared_crd_status"}, @@ -94,6 +111,7 @@ } +// Runtime returns the cluster-scoped CRD runtime handler for clusterID; it returns nil for the default cluster or when no base apiextensions config is set.
func (m *CRDRuntimeManager) Runtime(clusterID string, stopCh <-chan struct{}) (http.Handler, error) { if m == nil || clusterID == "" || clusterID == m.opts.DefaultCluster || m.opts.BaseAPIExtensionsConfig == nil { return nil, nil @@ -147,7 +165,6 @@ func (m *CRDRuntimeManager) ServesGroupVersion(clusterID, group, version string, crdServesLookupLat.WithLabelValues(r).Observe(time.Since(start).Seconds()) return served, nil } - r := result(false) crdServesLookupTotal.WithLabelValues(r).Inc() crdServesLookupLat.WithLabelValues(r).Observe(time.Since(start).Seconds()) @@ -348,9 +365,9 @@ func (m *CRDRuntimeManager) ensureSharedCRDState(stopCh <-chan struct{}) error { return nil, err } - factory := apiextensionsinformers.NewSharedInformerFactory(cs, 0) + factory := apiextensionsinformers.NewSharedInformerFactory(cs, 5*time.Second) crdInformer := factory.Apiextensions().V1().CustomResourceDefinitions().Informer() - if err := crdInformer.SetTransform(transformCRDForShared(mc.DefaultClusterAnnotation)); err != nil { + if err := crdInformer.SetTransform(transformCRDForShared(m.opts.DefaultCluster, m.clusterForCRDFromStorage)); err != nil { return nil, err } _, err = crdInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ @@ -374,7 +391,6 @@ func (m *CRDRuntimeManager) ensureSharedCRDState(stopCh <-chan struct{}) error { return nil, fmt.Errorf("failed waiting for shared CRD informer sync") } m.startSharedCRDWorkers(startStop) - m.rebuildSharedProjection(crdInformer.GetStore().List()) m.mu.Lock() @@ -398,7 +414,13 @@ func (m *CRDRuntimeManager) rebuildSharedProjection(objs []interface{}) { if !ok { continue } - clusterID := objectClusterID(crd, mc.DefaultClusterAnnotation) + clusterID := objectClusterID(crd) + if clusterID == "" { + clusterID = m.clusterForCRDFromStorage(crd) + } + if clusterID == "" { + clusterID = m.opts.DefaultCluster + } if clusterID == "" { continue } @@ -406,7 +428,7 @@ func (m *CRDRuntimeManager) rebuildSharedProjection(objs []interface{}) { 
decodedObjs = append(decodedObjs, decoded) byCluster[clusterID] = append(byCluster[clusterID], decoded) } - m.sharedProjection.ReplaceAll(decodedObjs, mc.DefaultClusterAnnotation) + m.sharedProjection.ReplaceAll(decodedObjs) for clusterID, clusterObjs := range byCluster { m.servesIndex.RebuildCluster(clusterID, clusterObjs) } @@ -417,7 +439,13 @@ func (m *CRDRuntimeManager) onSharedCRDUpsert(obj interface{}) { if !ok { return } - clusterID := objectClusterID(crd, mc.DefaultClusterAnnotation) + clusterID := objectClusterID(crd) + if clusterID == "" { + clusterID = m.clusterForCRDFromStorage(crd) + } + if clusterID == "" { + clusterID = m.opts.DefaultCluster + } if clusterID == "" { return } @@ -432,7 +460,13 @@ func (m *CRDRuntimeManager) onSharedCRDDelete(obj interface{}) { if !ok { return } - clusterID := objectClusterID(crd, mc.DefaultClusterAnnotation) + clusterID := objectClusterID(crd) + if clusterID == "" { + clusterID = m.clusterForCRDFromStorage(crd) + } + if clusterID == "" { + clusterID = m.opts.DefaultCluster + } if clusterID == "" { return } @@ -442,11 +476,13 @@ func (m *CRDRuntimeManager) onSharedCRDDelete(obj interface{}) { } func (m *CRDRuntimeManager) EnsureCluster(clusterID string, stopCh <-chan struct{}) error { - if m == nil || clusterID == "" || clusterID == m.opts.DefaultCluster { + if m == nil || clusterID == "" { return nil } - if _, err := m.ensureSharedRuntime(stopCh); err != nil { - return err + if clusterID != m.opts.DefaultCluster { + if _, err := m.ensureSharedRuntime(stopCh); err != nil { + return err + } } if err := m.ensureSharedCRDState(stopCh); err != nil { return err @@ -608,7 +644,7 @@ func (s *crdProjectionStore) List(clusterID string) []*apiextensionsv1.CustomRes return out } -func (s *crdProjectionStore) ReplaceAll(objs []interface{}, clusterLabelKey string) { +func (s *crdProjectionStore) ReplaceAll(objs []interface{}) { if s == nil { return } @@ -618,7 +654,7 @@ func (s *crdProjectionStore) ReplaceAll(objs []interface{}, 
clusterLabelKey stri if !ok { continue } - clusterID := objectClusterID(crd, clusterLabelKey) + clusterID := objectClusterID(crd) if clusterID == "" { continue } @@ -680,26 +716,62 @@ func (m *CRDRuntimeManager) sharedStartStopCh(stopCh <-chan struct{}) <-chan str return m.sharedStopCh } -func objectClusterID(obj interface{}, clusterLabelKey string) string { - if clusterLabelKey == "" { - clusterLabelKey = mc.DefaultClusterAnnotation +func objectClusterID(obj interface{}) string { + if entry, ok := obj.(*mcstorage.InternalEntry); ok && entry != nil { + if entry.ClusterID != "" { + return entry.ClusterID + } + if entry.StorageKey != "" { + i := strings.Index(entry.StorageKey, "/clusters/") + if i > 0 { + root := strings.TrimSuffix(entry.StorageKey[:i+len("/clusters")], "/") + resolver := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: root} + if cid, ok := resolver.ClusterFromStorageKey(entry.StorageKey); ok { + return cid + } + } + } + if entry.Object != nil { + obj = entry.Object + } } - accessor, err := meta.Accessor(obj) - if err != nil { + crd, ok := obj.(*apiextensionsv1.CustomResourceDefinition) + if !ok || crd == nil { return "" } - return accessor.GetLabels()[clusterLabelKey] + if crd.Name == "" { + return "" + } + if !strings.HasPrefix(crd.Name, sharedCRDNamePrefix) { + return "" + } + rest := strings.TrimPrefix(crd.Name, sharedCRDNamePrefix) + idx := strings.Index(rest, "__") + if idx <= 0 { + return "" + } + return rest[:idx] } const sharedCRDNamePrefix = "__mc_shared_crd__" -func transformCRDForShared(clusterLabelKey string) cache.TransformFunc { +func (m *CRDRuntimeManager) clusterForCRDFromStorage(crd *apiextensionsv1.CustomResourceDefinition) string { + if m == nil || m.crdPlacement == nil { + return "" + } + return m.crdPlacement.ClusterForCRD(crd) +} + +func transformCRDForShared(_ string, resolver func(*apiextensionsv1.CustomResourceDefinition) string) cache.TransformFunc { return func(obj interface{}) (interface{}, error) { crd, ok := 
obj.(*apiextensionsv1.CustomResourceDefinition) if !ok || crd == nil { return obj, nil } - clusterID := objectClusterID(crd, clusterLabelKey) + clusterID := objectClusterID(crd) + if clusterID == "" && resolver != nil { + clusterID = resolver(crd) + } if clusterID == "" { return obj, nil } @@ -709,6 +781,139 @@ func transformCRDForShared(clusterLabelKey string) cache.TransformFunc { } } +type crdStoragePlacementResolver struct { + etcdPrefix string + transport storagebackend.TransportConfig + kindRootPrefix string + resolver mcstorage.PlacementResolver + + mu sync.Mutex + client *clientv3.Client +} + +func newCRDStoragePlacementResolver(etcdPrefix string, transportCfg storagebackend.TransportConfig, kindRootPrefix string) *crdStoragePlacementResolver { + trimmedRoot := strings.TrimSuffix(kindRootPrefix, "/") + if strings.TrimSpace(etcdPrefix) == "" { + etcdPrefix = "/registry" + } + return &crdStoragePlacementResolver{ + etcdPrefix: strings.TrimSuffix(etcdPrefix, "/"), + transport: transportCfg, + kindRootPrefix: trimmedRoot, + resolver: mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: trimmedRoot}, + } +} + +func (r *crdStoragePlacementResolver) ClusterForCRD(crd *apiextensionsv1.CustomResourceDefinition) string { + if r == nil || crd == nil || crd.Name == "" { + return "" + } + if crd.GetResourceVersion() == "" { + return "" + } + cli, err := r.getClient() + if err != nil { + return "" + } + wantRV := int64(0) + if rv := crd.GetResourceVersion(); rv != "" { + if parsed, err := strconv.ParseInt(rv, 10, 64); err == nil { + wantRV = parsed + } + } + if wantRV == 0 { + return "" + } + queryPrefix := strings.TrimSuffix(r.fullETCDPrefixForRoot(), "/") + "/" + wantSuffix := "/" + crd.Name + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + resp, err := cli.Get( + ctx, + queryPrefix, + clientv3.WithPrefix(), + clientv3.WithKeysOnly(), + 
clientv3.WithMinModRev(wantRV), + clientv3.WithMaxModRev(wantRV), + ) + cancel() + if err == nil { + for _, kv := range resp.Kvs { + storageKey := r.storageKeyFromETCDKey(string(kv.Key)) + if storageKey == "" || !strings.HasSuffix(storageKey, wantSuffix) { + continue + } + cid, ok := r.resolver.ClusterFromStorageKey(storageKey) + if ok && cid != "" && kv.ModRevision == wantRV { + return cid + } + } + } + time.Sleep(25 * time.Millisecond) + } + return "" +} + +func (r *crdStoragePlacementResolver) fullETCDPrefixForRoot() string { + return strings.TrimSuffix(r.etcdPrefix, "/") + "/" + strings.TrimPrefix(r.kindRootPrefix, "/") +} + +func (r *crdStoragePlacementResolver) storageKeyFromETCDKey(etcdKey string) string { + prefix := strings.TrimSuffix(r.etcdPrefix, "/") + trimmed := strings.TrimPrefix(etcdKey, prefix) + if trimmed == etcdKey { + return "" + } + if !strings.HasPrefix(trimmed, "/") { + trimmed = "/" + trimmed + } + return trimmed +} + +func (r *crdStoragePlacementResolver) getClient() (*clientv3.Client, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.client != nil { + return r.client, nil + } + if len(r.transport.ServerList) == 0 { + if env := strings.TrimSpace(os.Getenv("ETCD_ENDPOINTS")); env != "" { + for _, ep := range strings.Split(env, ",") { + ep = strings.TrimSpace(ep) + if ep != "" { + r.transport.ServerList = append(r.transport.ServerList, ep) + } + } + } + } + if len(r.transport.ServerList) == 0 { + return nil, fmt.Errorf("no etcd servers configured") + } + cfg := clientv3.Config{ + Endpoints: append([]string{}, r.transport.ServerList...), + DialTimeout: 5 * time.Second, + } + if r.transport.CertFile != "" || r.transport.KeyFile != "" || r.transport.TrustedCAFile != "" { + tlsInfo := transport.TLSInfo{ + CertFile: r.transport.CertFile, + KeyFile: r.transport.KeyFile, + TrustedCAFile: r.transport.TrustedCAFile, + } + tlsCfg, err := tlsInfo.ClientConfig() + if err != nil { + return nil, err + } + cfg.TLS = tlsCfg + } + cli, err := 
clientv3.New(cfg) + if err != nil { + return nil, err + } + r.client = cli + return r.client, nil +} + func encodeSharedCRDName(clusterID, name string) string { if clusterID == "" || name == "" { return name @@ -799,11 +1004,30 @@ func (m *CRDRuntimeManager) reconcileCRDStatusKey(key string) error { if !ok { return nil } - crd, found := m.sharedProjection.Get(clusterID, name) - if !found || crd == nil { + _, found := m.sharedProjection.Get(clusterID, name) + if !found { + return nil + } + cs, err := m.ensureClusterClient(clusterID) + if err != nil { + return err + } + live, err := cs.ApiextensionsV1().CustomResourceDefinitions().Get(context.TODO(), name, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + if live.DeletionTimestamp != nil && hasCRDCleanupFinalizer(live) { + return m.reconcileDeletingCRD(clusterID, live) + } + // Let upstream finalizer/status controllers own terminating CRDs. + // Overwriting status during deletion can stall customresourcecleanup. 
+ if live.DeletionTimestamp != nil { return nil } - desired := crd.DeepCopy() + desired := live.DeepCopy() desired.Status.AcceptedNames = desired.Spec.Names namesAccepted := apiextensionsv1.CustomResourceDefinitionCondition{ @@ -830,20 +1054,172 @@ func (m *CRDRuntimeManager) reconcileCRDStatusKey(key string) error { } apiextensionshelpers.SetCRDCondition(desired, established) - if equality.Semantic.DeepEqual(crd.Status, desired.Status) { + if equality.Semantic.DeepEqual(live.Status, desired.Status) { return nil } + _, err = cs.ApiextensionsV1().CustomResourceDefinitions().UpdateStatus(context.TODO(), desired, metav1.UpdateOptions{}) + if apierrors.IsNotFound(err) { + return nil + } + if apierrors.IsConflict(err) { + return err + } + return err +} +func hasCRDCleanupFinalizer(crd *apiextensionsv1.CustomResourceDefinition) bool { + for _, f := range crd.Finalizers { + if f == apiextensionsv1.CustomResourceCleanupFinalizer { + return true + } + } + return false +} + +func (m *CRDRuntimeManager) reconcileDeletingCRD(clusterID string, crd *apiextensionsv1.CustomResourceDefinition) error { cs, err := m.ensureClusterClient(clusterID) if err != nil { return err } - _, err = cs.ApiextensionsV1().CustomResourceDefinitions().UpdateStatus(context.TODO(), desired, metav1.UpdateOptions{}) + + live, err := cs.ApiextensionsV1().CustomResourceDefinitions().Get(context.TODO(), crd.Name, metav1.GetOptions{}) if apierrors.IsNotFound(err) { return nil } - if apierrors.IsConflict(err) { + if err != nil { return err } - return err + if live.DeletionTimestamp == nil || !hasCRDCleanupFinalizer(live) { + return nil + } + + inProgress := live.DeepCopy() + apiextensionshelpers.SetCRDCondition(inProgress, apiextensionsv1.CustomResourceDefinitionCondition{ + Type: apiextensionsv1.Terminating, + Status: apiextensionsv1.ConditionTrue, + Reason: "InstanceDeletionInProgress", + Message: "CustomResource deletion is in progress", + }) + if _, err := 
cs.ApiextensionsV1().CustomResourceDefinitions().UpdateStatus(context.TODO(), inProgress, metav1.UpdateOptions{}); err != nil { + if apierrors.IsNotFound(err) || apierrors.IsConflict(err) { + return nil + } + return err + } + + cond, delErr := m.deleteCRInstances(clusterID, live) + + live, err = cs.ApiextensionsV1().CustomResourceDefinitions().Get(context.TODO(), crd.Name, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + next := live.DeepCopy() + apiextensionshelpers.SetCRDCondition(next, cond) + if delErr != nil { + if _, err := cs.ApiextensionsV1().CustomResourceDefinitions().UpdateStatus(context.TODO(), next, metav1.UpdateOptions{}); err != nil && !apierrors.IsConflict(err) && !apierrors.IsNotFound(err) { + klog.Errorf("mc.crd status update failed while deleting instances cluster=%s crd=%s err=%v", clusterID, crd.Name, err) + } + return delErr + } + + apiextensionshelpers.CRDRemoveFinalizer(next, apiextensionsv1.CustomResourceCleanupFinalizer) + if _, err := cs.ApiextensionsV1().CustomResourceDefinitions().UpdateStatus(context.TODO(), next, metav1.UpdateOptions{}); err != nil { + if apierrors.IsNotFound(err) || apierrors.IsConflict(err) { + return nil + } + return err + } + return nil } + +func (m *CRDRuntimeManager) deleteCRInstances(clusterID string, crd *apiextensionsv1.CustomResourceDefinition) (apiextensionsv1.CustomResourceDefinitionCondition, error) { + dc, err := m.dynamicClientForCluster(clusterID) + if err != nil { + return deletingCond("InstanceDeletionFailed", fmt.Sprintf("could not create dynamic client: %v", err)), err + } + + storageVersion, err := apiextensionshelpers.GetCRDStorageVersion(crd) + if err != nil { + return deletingCond("InstanceDeletionFailed", fmt.Sprintf("could not resolve storage version: %v", err)), err + } + resource := crd.Status.AcceptedNames.Plural + if resource == "" { + resource = crd.Spec.Names.Plural + } + gvr := schema.GroupVersionResource{ + Group: 
crd.Spec.Group, + Version: storageVersion, + Resource: resource, + } + rc := dc.Resource(gvr) + + ctx := context.TODO() + list, err := rc.List(ctx, metav1.ListOptions{}) + if err != nil { + return deletingCond("InstanceDeletionFailed", fmt.Sprintf("could not list instances: %v", err)), err + } + + if crd.Spec.Scope == apiextensionsv1.NamespaceScoped { + seen := sets.New[string]() + for _, item := range list.Items { + ns := item.GetNamespace() + if ns == "" || seen.Has(ns) { + continue + } + seen.Insert(ns) + if err := rc.Namespace(ns).DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}); err != nil && !apierrors.IsNotFound(err) { + return deletingCond("InstanceDeletionFailed", fmt.Sprintf("could not issue all deletes: %v", err)), err + } + } + } else { + if err := rc.DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{}); err != nil && !apierrors.IsNotFound(err) { + return deletingCond("InstanceDeletionFailed", fmt.Sprintf("could not issue all deletes: %v", err)), err + } + } + + err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) { + l, err := rc.List(ctx, metav1.ListOptions{}) + if err != nil { + return false, err + } + return len(l.Items) == 0, nil + }) + if err != nil { + return deletingCond("InstanceDeletionCheck", fmt.Sprintf("could not confirm zero CustomResources remaining: %v", err)), err + } + return deletingCond("InstanceDeletionCompleted", "removed all instances"), nil +} + +func deletingCond(reason, message string) apiextensionsv1.CustomResourceDefinitionCondition { + status := apiextensionsv1.ConditionTrue + if reason == "InstanceDeletionCompleted" { + status = apiextensionsv1.ConditionFalse + } + return apiextensionsv1.CustomResourceDefinitionCondition{ + Type: apiextensionsv1.Terminating, + Status: status, + Reason: reason, + Message: message, + } +} + +func (m *CRDRuntimeManager) dynamicClientForCluster(clusterID string) (dynamic.Interface, error) { + base 
:= m.baseLoopbackConfig() + if base == nil { + return nil, fmt.Errorf("base apiextensions loopback config is required") + } + cfg := rest.CopyConfig(base) + host, err := mc.ClusterHost(cfg.Host, mc.Options{ + PathPrefix: m.opts.PathPrefix, + ControlPlaneSegment: m.opts.ControlPlaneSegment, + }, clusterID) + if err != nil { + return nil, err + } + cfg.Host = host + return dynamic.NewForConfig(cfg) +} + diff --git a/pkg/multicluster/bootstrap/kubernetesservice_controller.go b/pkg/multicluster/bootstrap/kubernetesservice_controller.go index b34f261..2e679aa 100644 --- a/pkg/multicluster/bootstrap/kubernetesservice_controller.go +++ b/pkg/multicluster/bootstrap/kubernetesservice_controller.go @@ -138,6 +138,11 @@ func (c *kubernetesServiceController) reconcile() error { } epClient := c.client.CoreV1().Endpoints(metav1.NamespaceDefault) + // Endpoint objects reject loopback addresses. In local envtest setups the + // loopback listener is expected; skip endpoint reconciliation in that case. + if c.publicIP != nil && c.publicIP.IsLoopback() { + return nil + } want := &corev1.Endpoints{ ObjectMeta: metav1.ObjectMeta{ Name: "kubernetes", diff --git a/pkg/multicluster/object_identity.go b/pkg/multicluster/object_identity.go new file mode 100644 index 0000000..0c75e3a --- /dev/null +++ b/pkg/multicluster/object_identity.go @@ -0,0 +1,15 @@ +package multicluster + +import "k8s.io/apimachinery/pkg/runtime" + +// SetObjectClusterIdentity intentionally does not persist identity on objects. +func SetObjectClusterIdentity(_ runtime.Object, _ string) {} + +// ClearObjectClusterIdentity intentionally no-ops. +func ClearObjectClusterIdentity(_ runtime.Object) {} + +// ObjectClusterIdentity intentionally returns empty to avoid metadata coupling. 
+func ObjectClusterIdentity(_ interface{}) string { + return "" +} + diff --git a/pkg/multicluster/options.go b/pkg/multicluster/options.go index d0f017b..31df147 100644 --- a/pkg/multicluster/options.go +++ b/pkg/multicluster/options.go @@ -12,7 +12,6 @@ import ( // EtcdPrefix must match the apiserver's configured etcd prefix (e.g. "/registry"). // DefaultCluster is used when no cluster can be extracted. // PathPrefix and ControlPlaneSegment define the URL form for path-based extraction. -// ClusterAnnotationKey is the server-owned annotation storing the cluster id. // ClusterFieldKey is a synthetic field name exposed via AttrFunc for selectors. // WatchStrategy selects shared kind-root or per-cluster watch behavior. // ClusterSource optionally provides cluster IDs for per-cluster watch. @@ -24,7 +23,6 @@ type Options struct { DefaultCluster string PathPrefix string ControlPlaneSegment string - ClusterAnnotationKey string ClusterFieldKey string WatchStrategy WatchStrategy ClusterSource ClusterSource @@ -54,7 +52,6 @@ const ( DefaultInternalCrossClusterUser = "system:apiserver" DefaultInternalCrossClusterCapability = "kplane.internal/cross-cluster-read" DefaultInternalCrossClusterUserAgent = "kplane-internal-cross-cluster" - DefaultClusterAnnotation = "multicluster.k8s.io/cluster" DefaultClusterField = "metadata.cluster" ) @@ -63,7 +60,6 @@ var DefaultOptions = Options{ DefaultCluster: DefaultClusterName, PathPrefix: DefaultPathPrefix, ControlPlaneSegment: DefaultControlPlaneSegment, - ClusterAnnotationKey: DefaultClusterAnnotation, ClusterFieldKey: DefaultClusterField, WatchStrategy: KindRootWatch{}, } diff --git a/pkg/multicluster/scopedinformer/shared.go b/pkg/multicluster/scopedinformer/shared.go index d23fb84..eef2144 100644 --- a/pkg/multicluster/scopedinformer/shared.go +++ b/pkg/multicluster/scopedinformer/shared.go @@ -3,14 +3,16 @@ package scopedinformer import ( "context" "fmt" + "strings" "time" - "k8s.io/apimachinery/pkg/api/meta" + 
"k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" mc "github.com/kplane-dev/apiserver/pkg/multicluster" + mcstorage "github.com/kplane-dev/apiserver/pkg/multicluster/storage" ) const ClusterIndexName = "mc.cluster" @@ -34,10 +36,10 @@ func NewAllClustersKubeClient(base *rest.Config) (kubernetes.Interface, error) { return kubernetes.NewForConfig(cfg) } -func EnsureClusterIndex(inf cache.SharedIndexInformer, clusterLabelKey string) error { +func EnsureClusterIndex(inf cache.SharedIndexInformer) error { return inf.AddIndexers(cache.Indexers{ ClusterIndexName: func(obj interface{}) ([]string, error) { - cid := ObjectCluster(obj, clusterLabelKey) + cid := ObjectCluster(obj) if cid == "" { return nil, nil } @@ -46,15 +48,66 @@ func EnsureClusterIndex(inf cache.SharedIndexInformer, clusterLabelKey string) e }) } -func ObjectCluster(obj interface{}, clusterLabelKey string) string { +// ClusterEntryTransform wraps informer objects into InternalEntry so cluster +// metadata can be carried outside model fields. 
+func ClusterEntryTransform() cache.TransformFunc { + return func(obj interface{}) (interface{}, error) { + if entry, ok := obj.(*mcstorage.InternalEntry); ok && entry != nil { + return entry, nil + } + ro, ok := obj.(runtime.Object) + if !ok || ro == nil { + return obj, nil + } + return &mcstorage.InternalEntry{ + Object: ro, + }, nil + } +} + +func ObjectCluster(obj interface{}) string { if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { obj = tombstone.Obj } - acc, err := meta.Accessor(obj) - if err != nil { + if entry, ok := obj.(*mcstorage.InternalEntry); ok && entry != nil { + if entry.ClusterID != "" { + return entry.ClusterID + } + if entry.StorageKey != "" { + if cid := clusterFromStorageKey(entry.StorageKey); cid != "" { + return cid + } + } return "" } - return acc.GetLabels()[clusterLabelKey] + return "" +} + +func clusterFromStorageKey(storageKey string) string { + i := strings.Index(storageKey, "/clusters/") + if i <= 0 { + return "" + } + root := strings.TrimSuffix(storageKey[:i+len("/clusters")], "/") + resolver := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: root} + cid, ok := resolver.ClusterFromStorageKey(storageKey) + if !ok { + return "" + } + return cid +} + +func UnwrapObject(obj interface{}) interface{} { + if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { + obj = tombstone.Obj + } + if entry, ok := obj.(*mcstorage.InternalEntry); ok && entry != nil { + if entry.Object != nil { + return entry.Object + } + return nil + } + return obj } func FilteredByCluster(indexer cache.Indexer, clusterID string) []interface{} { @@ -62,17 +115,24 @@ func FilteredByCluster(indexer cache.Indexer, clusterID string) []interface{} { if err != nil { return nil } - return items + out := make([]interface{}, 0, len(items)) + for _, item := range items { + unwrapped := UnwrapObject(item) + if unwrapped == nil { + continue + } + out = append(out, unwrapped) + } + return out } -func NewFilteredSharedIndexInformer(shared 
cache.SharedIndexInformer, clusterID, clusterLabelKey string) cache.SharedIndexInformer { - return &filteredSharedIndexInformer{shared: shared, clusterID: clusterID, clusterLabelKey: clusterLabelKey} +func NewFilteredSharedIndexInformer(shared cache.SharedIndexInformer, clusterID string) cache.SharedIndexInformer { + return &filteredSharedIndexInformer{shared: shared, clusterID: clusterID} } type filteredSharedIndexInformer struct { - shared cache.SharedIndexInformer - clusterID string - clusterLabelKey string + shared cache.SharedIndexInformer + clusterID string } func (f *filteredSharedIndexInformer) AddEventHandler(handler cache.ResourceEventHandler) (cache.ResourceEventHandlerRegistration, error) { @@ -122,25 +182,37 @@ func (f *filteredSharedIndexInformer) wrap(handler cache.ResourceEventHandler) c } return cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { - if ObjectCluster(obj, f.clusterLabelKey) == f.clusterID { - handler.OnAdd(obj, false) + if ObjectCluster(obj) == f.clusterID { + if unwrapped := UnwrapObject(obj); unwrapped != nil { + handler.OnAdd(unwrapped, false) + } } }, UpdateFunc: func(oldObj, newObj interface{}) { - oldMatch := ObjectCluster(oldObj, f.clusterLabelKey) == f.clusterID - newMatch := ObjectCluster(newObj, f.clusterLabelKey) == f.clusterID + oldMatch := ObjectCluster(oldObj) == f.clusterID + newMatch := ObjectCluster(newObj) == f.clusterID switch { case oldMatch && newMatch: - handler.OnUpdate(oldObj, newObj) + oldUnwrapped := UnwrapObject(oldObj) + newUnwrapped := UnwrapObject(newObj) + if oldUnwrapped != nil && newUnwrapped != nil { + handler.OnUpdate(oldUnwrapped, newUnwrapped) + } case !oldMatch && newMatch: - handler.OnAdd(newObj, false) + if unwrapped := UnwrapObject(newObj); unwrapped != nil { + handler.OnAdd(unwrapped, false) + } case oldMatch && !newMatch: - handler.OnDelete(oldObj) + if unwrapped := UnwrapObject(oldObj); unwrapped != nil { + handler.OnDelete(unwrapped) + } } }, DeleteFunc: func(obj interface{}) 
{ - if ObjectCluster(obj, f.clusterLabelKey) == f.clusterID { - handler.OnDelete(obj) + if ObjectCluster(obj) == f.clusterID { + if unwrapped := UnwrapObject(obj); unwrapped != nil { + handler.OnDelete(unwrapped) + } } }, } diff --git a/pkg/multicluster/scopedinformer/shared_test.go b/pkg/multicluster/scopedinformer/shared_test.go new file mode 100644 index 0000000..c771e4a --- /dev/null +++ b/pkg/multicluster/scopedinformer/shared_test.go @@ -0,0 +1,80 @@ +package scopedinformer + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + + mcstorage "github.com/kplane-dev/apiserver/pkg/multicluster/storage" +) + +func TestObjectCluster_FromInternalEntryClusterID(t *testing.T) { + obj := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "cm"}} + entry := &mcstorage.InternalEntry{ + Object: obj, + ClusterID: "c1", + } + if got := ObjectCluster(entry); got != "c1" { + t.Fatalf("expected c1 from internal entry, got %q", got) + } +} + +func TestObjectCluster_FromInternalEntryWithoutClusterID(t *testing.T) { + obj := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "cm"}} + entry := &mcstorage.InternalEntry{ + Object: obj, + } + if got := ObjectCluster(entry); got != "" { + t.Fatalf("expected empty cluster from internal entry without ClusterID, got %q", got) + } +} + +func TestObjectCluster_FromTombstoneInternalEntry(t *testing.T) { + obj := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "cm"}} + entry := &mcstorage.InternalEntry{ + Object: obj, + ClusterID: "c3", + } + tombstone := cache.DeletedFinalStateUnknown{Obj: entry} + if got := ObjectCluster(tombstone); got != "c3" { + t.Fatalf("expected c3 from tombstone internal entry, got %q", got) + } +} + +func TestObjectCluster_NonEntryHasNoFallback(t *testing.T) { + obj := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "cm"}} + if got := ObjectCluster(obj); got != "" { + t.Fatalf("expected empty cluster for non-entry 
object, got %q", got) + } +} + +func TestFilteredByCluster_UnwrapsInternalEntry(t *testing.T) { + indexer := cache.NewIndexer(func(obj interface{}) (string, error) { + if e, ok := obj.(*mcstorage.InternalEntry); ok && e != nil { + if e.StorageKey != "" { + return e.StorageKey, nil + } + return "entry/default/cm", nil + } + return cache.MetaNamespaceKeyFunc(obj) + }, cache.Indexers{ + ClusterIndexName: func(obj interface{}) ([]string, error) { + return []string{ObjectCluster(obj)}, nil + }, + }) + + obj := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "cm", Namespace: "default"}} + entry := &mcstorage.InternalEntry{Object: obj, ClusterID: "c5"} + if err := indexer.Add(entry); err != nil { + t.Fatalf("add: %v", err) + } + items := FilteredByCluster(indexer, "c5") + if len(items) != 1 { + t.Fatalf("expected one item, got %d", len(items)) + } + if _, ok := items[0].(*corev1.ConfigMap); !ok { + t.Fatalf("expected unwrapped ConfigMap, got %T", items[0]) + } +} diff --git a/pkg/multicluster/storage.go b/pkg/multicluster/storage.go index a14a887..05d4748 100644 --- a/pkg/multicluster/storage.go +++ b/pkg/multicluster/storage.go @@ -8,25 +8,33 @@ import ( "fmt" "os" "runtime/debug" + "strconv" "strings" "sync" "sync/atomic" + "time" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/watch" "k8s.io/apiserver/pkg/registry/generic" + apirequest "k8s.io/apiserver/pkg/endpoints/request" "k8s.io/apiserver/pkg/storage" "k8s.io/apiserver/pkg/storage/storagebackend" "k8s.io/apiserver/pkg/storage/storagebackend/factory" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" + mvccpb "go.etcd.io/etcd/api/v3/mvccpb" + "go.etcd.io/etcd/client/pkg/v3/transport" + clientv3 "go.etcd.io/etcd/client/v3" "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" "github.com/kplane-dev/apiserver/pkg/multicluster/internalcap" + mcstorage 
"github.com/kplane-dev/apiserver/pkg/multicluster/storage" + "github.com/kplane-dev/apiserver/pkg/multicluster/storagekeyaware" ) // RESTOptionsDecorator wraps the underlying getter to inject a decorator that @@ -132,6 +140,9 @@ type clusteredStorage struct { mu sync.Mutex store storage.Interface destroyFn factory.DestroyFunc + + etcdMu sync.Mutex + etcdClient *clientv3.Client } func newClusteredStorage( @@ -173,12 +184,16 @@ func (c *clusteredStorage) Create(ctx context.Context, key string, obj, out runt if err := c.rejectAllClustersMutation(ctx); err != nil { return err } - c.enforceObjectClusterLabel(obj, c.clusterFromContext(ctx)) + c.enforceObjectClusterIdentity(obj, c.clusterFromContext(ctx)) store, key, err := c.storeAndKey(ctx, key) if err != nil { return err } - return store.Create(ctx, key, obj, out, ttl) + if err := store.Create(ctx, key, obj, out, ttl); err != nil { + return err + } + c.sanitizeResponseObject(ctx, out) + return nil } func (c *clusteredStorage) Delete(ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions, validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object, opts storage.DeleteOptions) error { @@ -189,7 +204,11 @@ func (c *clusteredStorage) Delete(ctx context.Context, key string, out runtime.O if err != nil { return err } - return store.Delete(ctx, key, out, preconditions, validateDeletion, cachedExistingObject, opts) + if err := store.Delete(ctx, key, out, preconditions, validateDeletion, cachedExistingObject, opts); err != nil { + return err + } + c.sanitizeResponseObject(ctx, out) + return nil } func (c *clusteredStorage) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) { @@ -197,7 +216,11 @@ func (c *clusteredStorage) Watch(ctx context.Context, key string, opts storage.L if err != nil { return nil, err } - return store.Watch(ctx, key, opts) + w, err := store.Watch(ctx, key, opts) + if err != nil { + return nil, err + } + return 
mcstorage.NewEntryWatch(w), nil } func (c *clusteredStorage) Get(ctx context.Context, key string, opts storage.GetOptions, objPtr runtime.Object) error { @@ -205,7 +228,11 @@ func (c *clusteredStorage) Get(ctx context.Context, key string, opts storage.Get if err != nil { return err } - return store.Get(ctx, key, opts, objPtr) + if err := store.Get(ctx, key, opts, objPtr); err != nil { + return err + } + c.sanitizeResponseObject(ctx, objPtr) + return nil } func (c *clusteredStorage) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error { @@ -213,27 +240,58 @@ func (c *clusteredStorage) GetList(ctx context.Context, key string, opts storage if err != nil { return err } - return store.GetList(ctx, key, opts, listObj) + if err := store.GetList(ctx, key, opts, listObj); err != nil { + return err + } + c.sanitizeResponseList(ctx, listObj) + return nil } func (c *clusteredStorage) GuaranteedUpdate(ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool, precond *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error { if err := c.rejectAllClustersMutation(ctx); err != nil { return err } - store, key, err := c.storeAndKey(ctx, key) + store, rewrittenKey, err := c.storeAndKey(ctx, key) if err != nil { return err } + // CRD and other internal controllers run via loopback context without an + // explicit cluster in the URL. Resolve cluster from durable storage identity + // before writing status so updates route to the owning cluster keyspace. + if isInternalLoopbackContext(ctx) { + ctxCID, _, _ := FromContextScope(ctx) + if ctxCID == "" { + hintCID := c.clusterHintFromDestination(destination) + if hintCID == "" { + hintCID = c.clusterHintFromDestination(cachedExistingObject) + } + if hintCID == "" { + hintCID = c.clusterHintFromKey(key, precond) + } + // Never route CRD controller status writes to a guessed/default cluster. 
+ // If attribution is unresolved, force retry so correctness stays key-derived. + if hintCID == "" && strings.Trim(c.resourcePrefix, "/") == "apiextensions.k8s.io/customresourcedefinitions" { + return fmt.Errorf("unable to resolve CRD cluster from storage identity for key %q", key) + } + if hintCID != "" { + rewrittenKey = c.rewriteKey(hintCID, key) + } + } + } cid := c.clusterFromContext(ctx) wrappedUpdate := func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { outObj, ttl, err := tryUpdate(input, res) if err != nil || outObj == nil { return outObj, ttl, err } - c.enforceObjectClusterLabel(outObj, cid) + c.enforceObjectClusterIdentity(outObj, cid) return outObj, ttl, nil } - return store.GuaranteedUpdate(ctx, key, destination, ignoreNotFound, precond, wrappedUpdate, cachedExistingObject) + if err := store.GuaranteedUpdate(ctx, rewrittenKey, destination, ignoreNotFound, precond, wrappedUpdate, cachedExistingObject); err != nil { + return err + } + c.sanitizeResponseObject(ctx, destination) + return nil } func (c *clusteredStorage) Stats(ctx context.Context) (storage.Stats, error) { @@ -351,42 +409,501 @@ func (c *clusteredStorage) clusterFromContext(ctx context.Context) string { return cid } -func (c *clusteredStorage) enforceObjectClusterLabel(obj runtime.Object, cid string) { - if obj == nil { +func (c *clusteredStorage) enforceObjectClusterIdentity(obj runtime.Object, cid string) { + SetObjectClusterIdentity(obj, cid) +} + +func (c *clusteredStorage) sanitizeResponseObject(ctx context.Context, obj runtime.Object) { + if obj == nil || c.exposeInternalIdentity(ctx) { return } - acc, err := meta.Accessor(obj) - if err != nil { + ClearObjectClusterIdentity(obj) +} + +func (c *clusteredStorage) sanitizeResponseList(ctx context.Context, listObj runtime.Object) { + if listObj == nil || c.exposeInternalIdentity(ctx) { return } - key := c.options.ClusterAnnotationKey - if key == "" { - key = DefaultClusterAnnotation + _ = 
meta.EachListItem(listObj, func(obj runtime.Object) error { + ClearObjectClusterIdentity(obj) + return nil + }) +} + +func (c *clusteredStorage) exposeInternalIdentity(ctx context.Context) bool { + _, scope, _ := FromContextScope(ctx) + return scope == ResourceScopeCrossClusterRead +} + +func (c *clusteredStorage) clusterFromObject(obj runtime.Object) string { + if entry, ok := obj.(*mcstorage.InternalEntry); ok && entry != nil { + if entry.ClusterID != "" { + return entry.ClusterID + } + if entry.StorageKey != "" { + resolver := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: c.kindRootPrefix()} + if cid, ok := resolver.ClusterFromStorageKey(entry.StorageKey); ok && cid != "" { + return cid + } + } + if entry.Object != nil { + obj = entry.Object + } } - lbls := acc.GetLabels() - if lbls == nil { - lbls = map[string]string{} + if c.requiresStorageClusterLookup() { + if cid := c.lookupClusterFromStorage(obj); cid != "" { + return cid + } } - lbls[key] = cid - acc.SetLabels(lbls) + return c.defaultCluster() } -func (c *clusteredStorage) clusterFromObject(obj runtime.Object) string { +func (c *clusteredStorage) requiresStorageClusterLookup() bool { + rp := strings.Trim(c.resourcePrefix, "/") + switch rp { + case "apiextensions.k8s.io/customresourcedefinitions", + "serviceaccounts", + "secrets", + "configmaps", + "pods", + "nodes", + "endpointslices", + "services/specs", + "services/endpoints", + "namespaces", + "validatingwebhookconfigurations", + "mutatingwebhookconfigurations": + return true + default: + return false + } +} + +func (c *clusteredStorage) lookupClusterFromStorage(obj runtime.Object) string { if obj == nil { - return c.defaultCluster() + return "" + } + accessor, err := meta.Accessor(obj) + if err != nil { + return "" } - acc, err := meta.Accessor(obj) + name := accessor.GetName() + if name == "" { + return "" + } + namespace := accessor.GetNamespace() + wantRV := int64(0) + if rv := accessor.GetResourceVersion(); rv != "" { + if parsed, err := 
strconv.ParseInt(rv, 10, 64); err == nil { + wantRV = parsed + } + } + cli, err := c.getETCDClient() if err != nil { - return c.defaultCluster() + return "" + } + storagePrefix := strings.TrimSuffix(c.config.Prefix, "/") + "/" + strings.TrimPrefix(c.kindRootPrefix(), "/") + queryPrefix := strings.TrimSuffix(storagePrefix, "/") + "/" + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + matchCID := "" + matchScore := 0 + ambiguous := false + scan := func(kvs []*mvccpb.KeyValue) { + for _, kv := range kvs { + key := c.storageKeyFromETCDKey(string(kv.Key)) + if key == "" { + continue + } + cid, ok := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: c.kindRootPrefix()}.ClusterFromStorageKey(key) + if !ok || cid == "" { + continue + } + ns, nm, ok := objectPathFromStorageKey(key, c.kindRootPrefix()+"/") + if !ok || nm != name || ns != namespace { + continue + } + score := 1 + if wantRV > 0 && kv.ModRevision == wantRV { + score = 2 + } + if score > matchScore { + matchCID = cid + matchScore = score + if score == 1 { + ambiguous = false + } + if score == 2 { + return + } + } else if score == matchScore && score == 1 && matchCID != "" && matchCID != cid { + ambiguous = true + } + } + } + if wantRV > 0 { + if resp, err := cli.Get( + ctx, + queryPrefix, + clientv3.WithPrefix(), + clientv3.WithKeysOnly(), + clientv3.WithMinModRev(wantRV), + clientv3.WithMaxModRev(wantRV), + ); err == nil { + scan(resp.Kvs) + } + if matchScore == 2 { + return matchCID + } + } + resp, err := cli.Get(ctx, queryPrefix, clientv3.WithPrefix(), clientv3.WithKeysOnly()) + if err != nil { + return "" + } + for _, kv := range resp.Kvs { + key := c.storageKeyFromETCDKey(string(kv.Key)) + if key == "" { + continue + } + cid, ok := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: c.kindRootPrefix()}.ClusterFromStorageKey(key) + if !ok || cid == "" { + continue + } + ns, nm, ok := objectPathFromStorageKey(key, c.kindRootPrefix()+"/") + if !ok || nm != name || 
ns != namespace { + continue + } + score := 1 + if wantRV > 0 && kv.ModRevision == wantRV { + score = 2 + } + if score > matchScore { + matchCID = cid + matchScore = score + if score == 2 { + break + } + } + } + if wantRV == 0 && ambiguous { + return "" + } + return matchCID +} + +func (c *clusteredStorage) clusterHintFromDestination(obj runtime.Object) string { + if obj == nil { + return "" } - key := c.options.ClusterAnnotationKey + accessor, err := meta.Accessor(obj) + if err != nil { + return "" + } + name := accessor.GetName() + if name == "" { + return "" + } + namespace := accessor.GetNamespace() + wantRV := int64(0) + if rv := accessor.GetResourceVersion(); rv != "" { + if parsed, err := strconv.ParseInt(rv, 10, 64); err == nil { + wantRV = parsed + } + } + cli, err := c.getETCDClient() + if err != nil { + return "" + } + storagePrefix := strings.TrimSuffix(c.config.Prefix, "/") + "/" + strings.TrimPrefix(c.kindRootPrefix(), "/") + queryPrefix := strings.TrimSuffix(storagePrefix, "/") + "/" + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + matchCID := "" + matchScore := 0 + ambiguous := false + scan := func(kvs []*mvccpb.KeyValue) { + for _, kv := range kvs { + key := c.storageKeyFromETCDKey(string(kv.Key)) + if key == "" { + continue + } + cid, ok := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: c.kindRootPrefix()}.ClusterFromStorageKey(key) + if !ok || cid == "" { + continue + } + ns, nm, ok := objectPathFromStorageKey(key, c.kindRootPrefix()+"/") + if !ok || nm != name || ns != namespace { + continue + } + score := 1 + if wantRV > 0 && kv.ModRevision == wantRV { + score = 2 + } + if score > matchScore { + matchCID = cid + matchScore = score + if score == 1 { + ambiguous = false + } + if score == 2 { + return + } + } else if score == matchScore && score == 1 && matchCID != "" && matchCID != cid { + ambiguous = true + } + } + } + if wantRV > 0 { + if resp, err := cli.Get( + ctx, + queryPrefix, + 
clientv3.WithPrefix(), + clientv3.WithKeysOnly(), + clientv3.WithMinModRev(wantRV), + clientv3.WithMaxModRev(wantRV), + ); err == nil { + scan(resp.Kvs) + } + if matchScore == 2 { + return matchCID + } + } + resp, err := cli.Get(ctx, queryPrefix, clientv3.WithPrefix(), clientv3.WithKeysOnly()) + if err != nil { + return "" + } + for _, kv := range resp.Kvs { + key := c.storageKeyFromETCDKey(string(kv.Key)) + if key == "" { + continue + } + cid, ok := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: c.kindRootPrefix()}.ClusterFromStorageKey(key) + if !ok || cid == "" { + continue + } + ns, nm, ok := objectPathFromStorageKey(key, c.kindRootPrefix()+"/") + if !ok || nm != name || ns != namespace { + continue + } + score := 1 + if wantRV > 0 && kv.ModRevision == wantRV { + score = 2 + } + if score > matchScore { + matchCID = cid + matchScore = score + if score == 2 { + break + } + } + } + if wantRV == 0 && ambiguous { + return "" + } + return matchCID +} + +func (c *clusteredStorage) clusterHintFromKey(key string, precond *storage.Preconditions) string { if key == "" { - key = DefaultClusterAnnotation + return "" + } + base := strings.TrimSuffix(c.resourcePrefix, "/") + if !strings.HasPrefix(key, base+"/") { + return "" + } + rest := strings.TrimPrefix(key, base+"/") + parts := strings.Split(rest, "/") + var namespace, name string + switch len(parts) { + case 1: + name = parts[0] + default: + namespace, name = parts[len(parts)-2], parts[len(parts)-1] + } + if name == "" { + return "" + } + wantRV := int64(0) + if precond != nil && precond.ResourceVersion != nil { + if parsed, err := strconv.ParseInt(*precond.ResourceVersion, 10, 64); err == nil { + wantRV = parsed + } + } + cli, err := c.getETCDClient() + if err != nil { + return "" + } + storagePrefix := strings.TrimSuffix(c.config.Prefix, "/") + "/" + strings.TrimPrefix(c.kindRootPrefix(), "/") + queryPrefix := strings.TrimSuffix(storagePrefix, "/") + "/" + ctx, cancel := context.WithTimeout(context.Background(), 
10*time.Second) + defer cancel() + matchCID, matchScore := "", 0 + ambiguous := false + scan := func(kvs []*mvccpb.KeyValue) { + for _, kv := range kvs { + storageKey := c.storageKeyFromETCDKey(string(kv.Key)) + if storageKey == "" { + continue + } + cid, ok := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: c.kindRootPrefix()}.ClusterFromStorageKey(storageKey) + if !ok || cid == "" { + continue + } + ns, nm, ok := objectPathFromStorageKey(storageKey, c.kindRootPrefix()+"/") + if !ok || nm != name || ns != namespace { + continue + } + score := 1 + if wantRV > 0 && kv.ModRevision == wantRV { + score = 2 + } + if score > matchScore { + matchCID, matchScore = cid, score + if score == 1 { + ambiguous = false + } + if score == 2 { + return + } + } else if score == matchScore && score == 1 && matchCID != "" && matchCID != cid { + ambiguous = true + } + } } - if cid := acc.GetLabels()[key]; cid != "" { - return cid + if wantRV > 0 { + if resp, err := cli.Get( + ctx, + queryPrefix, + clientv3.WithPrefix(), + clientv3.WithKeysOnly(), + clientv3.WithMinModRev(wantRV), + clientv3.WithMaxModRev(wantRV), + ); err == nil { + scan(resp.Kvs) + } + if matchScore == 2 { + return matchCID + } } - return c.defaultCluster() + resp, err := cli.Get(ctx, queryPrefix, clientv3.WithPrefix(), clientv3.WithKeysOnly()) + if err != nil { + return "" + } + for _, kv := range resp.Kvs { + storageKey := c.storageKeyFromETCDKey(string(kv.Key)) + if storageKey == "" { + continue + } + cid, ok := mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: c.kindRootPrefix()}.ClusterFromStorageKey(storageKey) + if !ok || cid == "" { + continue + } + ns, nm, ok := objectPathFromStorageKey(storageKey, c.kindRootPrefix()+"/") + if !ok || nm != name || ns != namespace { + continue + } + score := 1 + if wantRV > 0 && kv.ModRevision == wantRV { + score = 2 + } + if score > matchScore { + matchCID, matchScore = cid, score + if score == 2 { + break + } + } + } + if wantRV == 0 && ambiguous { + return "" + } + 
return matchCID +} + +func isInternalLoopbackContext(ctx context.Context) bool { + u, ok := apirequest.UserFrom(ctx) + if !ok || u == nil { + return false + } + return u.GetName() == "system:apiserver" +} + +func objectPathFromStorageKey(storageKey, rootPrefix string) (namespace, name string, ok bool) { + if !strings.HasPrefix(storageKey, rootPrefix) { + return "", "", false + } + rest := strings.TrimPrefix(storageKey, rootPrefix) + parts := strings.Split(rest, "/") + if len(parts) < 2 { + return "", "", false + } + // parts[0] is cluster ID. + switch len(parts) { + case 2: + return "", parts[1], true + default: + return parts[1], parts[2], true + } +} + +func (c *clusteredStorage) storageKeyFromETCDKey(etcdKey string) string { + if etcdKey == "" { + return "" + } + prefix := strings.TrimSuffix(c.config.Prefix, "/") + if prefix == "" { + return etcdKey + } + trimmed := strings.TrimPrefix(etcdKey, prefix) + if trimmed == etcdKey { + return "" + } + if !strings.HasPrefix(trimmed, "/") { + trimmed = "/" + trimmed + } + return trimmed +} + +func (c *clusteredStorage) getETCDClient() (*clientv3.Client, error) { + c.etcdMu.Lock() + defer c.etcdMu.Unlock() + if c.etcdClient != nil { + return c.etcdClient, nil + } + if len(c.config.Transport.ServerList) == 0 { + if env := strings.TrimSpace(os.Getenv("ETCD_ENDPOINTS")); env != "" { + for _, ep := range strings.Split(env, ",") { + ep = strings.TrimSpace(ep) + if ep != "" { + c.config.Transport.ServerList = append(c.config.Transport.ServerList, ep) + } + } + } + } + if len(c.config.Transport.ServerList) == 0 { + return nil, fmt.Errorf("no etcd servers configured") + } + cfg := clientv3.Config{ + Endpoints: append([]string{}, c.config.Transport.ServerList...), + DialTimeout: 5 * time.Second, + } + if c.config.Transport.CertFile != "" || c.config.Transport.KeyFile != "" || c.config.Transport.TrustedCAFile != "" { + tlsInfo := transport.TLSInfo{ + CertFile: c.config.Transport.CertFile, + KeyFile: c.config.Transport.KeyFile, + 
TrustedCAFile: c.config.Transport.TrustedCAFile, + } + tlsCfg, err := tlsInfo.ClientConfig() + if err != nil { + return nil, err + } + cfg.TLS = tlsCfg + } + cli, err := clientv3.New(cfg) + if err != nil { + return nil, err + } + c.etcdClient = cli + return c.etcdClient, nil } func (c *clusteredStorage) rewriteKey(cluster, key string) string { @@ -430,6 +947,14 @@ func (c *clusteredStorage) ensureStore() (storage.Interface, error) { seq, server, c.resourcePrefix, kindRootPrefix, cfg.Prefix, hex.EncodeToString(stackHash[:8]), ) keyFunc := func(obj runtime.Object) (string, error) { + if entry, ok := obj.(*mcstorage.InternalEntry); ok && entry != nil { + if entry.StorageKey != "" { + return entry.StorageKey, nil + } + if entry.Object != nil { + obj = entry.Object + } + } key, err := c.keyFunc(obj) if err != nil { return "", err @@ -442,7 +967,7 @@ func (c *clusteredStorage) ensureStore() (storage.Interface, error) { if err != nil { return nil, err } - c.store = store + c.store = storagekeyaware.New(store, kindRootPrefix, c.config.Prefix, c.config.Transport, nil) c.destroyFn = destroy - return store, nil + return c.store, nil } diff --git a/pkg/multicluster/storage/entry.go b/pkg/multicluster/storage/entry.go new file mode 100644 index 0000000..2ecdd05 --- /dev/null +++ b/pkg/multicluster/storage/entry.go @@ -0,0 +1,31 @@ +package storage + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +// InternalEntry carries key-aware multicluster metadata for storage/cache paths. +// It is never serialized to API clients. 
+type InternalEntry struct { + Object runtime.Object + ClusterID string + StorageKey string + LayoutVersion string +} + +func (e *InternalEntry) DeepCopyObject() runtime.Object { + if e == nil { + return nil + } + out := *e + if e.Object != nil { + out.Object = e.Object.DeepCopyObject() + } + return &out +} + +func (e *InternalEntry) GetObjectKind() schema.ObjectKind { + return schema.EmptyObjectKind +} + diff --git a/pkg/multicluster/storage/keyaware_cache.go b/pkg/multicluster/storage/keyaware_cache.go new file mode 100644 index 0000000..852400b --- /dev/null +++ b/pkg/multicluster/storage/keyaware_cache.go @@ -0,0 +1,40 @@ +package storage + +import "sync" + +// EntryCache is a small in-memory helper for key-aware placement lookups. +// It is intentionally minimal and can be replaced by tighter cache integration. +type EntryCache struct { + mu sync.RWMutex + byKey map[string]*InternalEntry + byScope map[string][]*InternalEntry +} + +func NewEntryCache() *EntryCache { + return &EntryCache{ + byKey: map[string]*InternalEntry{}, + byScope: map[string][]*InternalEntry{}, + } +} + +func (c *EntryCache) Upsert(e *InternalEntry) { + if c == nil || e == nil || e.StorageKey == "" { + return + } + c.mu.Lock() + defer c.mu.Unlock() + c.byKey[e.StorageKey] = e + scope := e.ClusterID + c.byScope[scope] = append(c.byScope[scope], e) +} + +func (c *EntryCache) Get(storageKey string) (*InternalEntry, bool) { + if c == nil || storageKey == "" { + return nil, false + } + c.mu.RLock() + defer c.mu.RUnlock() + e, ok := c.byKey[storageKey] + return e, ok +} + diff --git a/pkg/multicluster/storage/keyaware_index.go b/pkg/multicluster/storage/keyaware_index.go new file mode 100644 index 0000000..3513be3 --- /dev/null +++ b/pkg/multicluster/storage/keyaware_index.go @@ -0,0 +1,9 @@ +package storage + +import "fmt" + +// ClusterScopedObjectKey builds a stable composite key for internal indexes. 
+func ClusterScopedObjectKey(clusterID, namespace, name string) string { + return fmt.Sprintf("%s/%s/%s", clusterID, namespace, name) +} + diff --git a/pkg/multicluster/storage/keyaware_index_test.go b/pkg/multicluster/storage/keyaware_index_test.go new file mode 100644 index 0000000..14dbf3e --- /dev/null +++ b/pkg/multicluster/storage/keyaware_index_test.go @@ -0,0 +1,12 @@ +package storage + +import "testing" + +func TestClusterScopedObjectKey(t *testing.T) { + got := ClusterScopedObjectKey("c1", "default", "obj") + want := "c1/default/obj" + if got != want { + t.Fatalf("key=%q want=%q", got, want) + } +} + diff --git a/pkg/multicluster/storage/keyaware_watch.go b/pkg/multicluster/storage/keyaware_watch.go new file mode 100644 index 0000000..0c473c9 --- /dev/null +++ b/pkg/multicluster/storage/keyaware_watch.go @@ -0,0 +1,50 @@ +package storage + +import ( + "k8s.io/apimachinery/pkg/watch" +) + +// EntryWatch adapts a watch stream that carries InternalEntry objects into +// a stream that emits only runtime objects to API callers. 
+type EntryWatch struct { + source watch.Interface + resultCh chan watch.Event +} + +func NewEntryWatch(source watch.Interface) watch.Interface { + if source == nil { + return nil + } + w := &EntryWatch{ + source: source, + resultCh: make(chan watch.Event), + } + go w.run() + return w +} + +func (w *EntryWatch) Stop() { + if w.source != nil { + w.source.Stop() + } +} + +func (w *EntryWatch) ResultChan() <-chan watch.Event { + return w.resultCh +} + +func (w *EntryWatch) run() { + defer close(w.resultCh) + for evt := range w.source.ResultChan() { + entry, ok := evt.Object.(*InternalEntry) + if !ok || entry == nil { + w.resultCh <- evt + continue + } + if entry.Object == nil { + continue + } + w.resultCh <- watch.Event{Type: evt.Type, Object: entry.Object} + } +} + diff --git a/pkg/multicluster/storage/keyaware_watch_test.go b/pkg/multicluster/storage/keyaware_watch_test.go new file mode 100644 index 0000000..9406e92 --- /dev/null +++ b/pkg/multicluster/storage/keyaware_watch_test.go @@ -0,0 +1,34 @@ +package storage + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/watch" +) + +func TestNewEntryWatch_EmitsRuntimeObjects(t *testing.T) { + fw := watch.NewFake() + defer fw.Stop() + + out := NewEntryWatch(fw) + cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "cm"}} + fw.Add(&InternalEntry{Object: cm, ClusterID: "c1", StorageKey: "/registry/configmaps/clusters/c1/default/cm"}) + + evt, ok := <-out.ResultChan() + if !ok { + t.Fatalf("expected watch event") + } + if evt.Type != watch.Added { + t.Fatalf("event type=%s want=%s", evt.Type, watch.Added) + } + got, ok := evt.Object.(*corev1.ConfigMap) + if !ok { + t.Fatalf("expected ConfigMap object, got %T", evt.Object) + } + if got.Name != "cm" { + t.Fatalf("name=%q want=cm", got.Name) + } +} + diff --git a/pkg/multicluster/storage/placement_resolver.go b/pkg/multicluster/storage/placement_resolver.go new file mode 100644 index 
0000000..f167db6 --- /dev/null +++ b/pkg/multicluster/storage/placement_resolver.go @@ -0,0 +1,36 @@ +package storage + +import "strings" + +// PlacementResolver resolves cluster identity from storage key metadata. +type PlacementResolver interface { + ClusterFromStorageKey(storageKey string) (clusterID string, ok bool) +} + +// KeyLayoutPlacementResolver resolves cluster identity for keys shaped like: +// //... +// where kindRootPrefix is ".../clusters". +type KeyLayoutPlacementResolver struct { + KindRootPrefix string +} + +func (r KeyLayoutPlacementResolver) ClusterFromStorageKey(storageKey string) (string, bool) { + if strings.TrimSpace(storageKey) == "" { + return "", false + } + root := strings.TrimSuffix(r.KindRootPrefix, "/") + key := strings.TrimSuffix(storageKey, "/") + if root == "" || !strings.HasPrefix(key, root+"/") { + return "", false + } + rest := strings.TrimPrefix(key, root+"/") + if rest == "" { + return "", false + } + i := strings.IndexByte(rest, '/') + if i <= 0 { + return rest, true + } + return rest[:i], true +} + diff --git a/pkg/multicluster/storage/placement_resolver_test.go b/pkg/multicluster/storage/placement_resolver_test.go new file mode 100644 index 0000000..67824ee --- /dev/null +++ b/pkg/multicluster/storage/placement_resolver_test.go @@ -0,0 +1,32 @@ +package storage + +import "testing" + +func TestKeyLayoutPlacementResolver_ClusterFromStorageKey(t *testing.T) { + r := KeyLayoutPlacementResolver{KindRootPrefix: "/registry/apps/deployments/clusters"} + + tests := []struct { + name string + key string + wantCID string + wantFound bool + }{ + {name: "normal", key: "/registry/apps/deployments/clusters/c1/default/d1", wantCID: "c1", wantFound: true}, + {name: "cluster root only", key: "/registry/apps/deployments/clusters/c2", wantCID: "c2", wantFound: true}, + {name: "wrong prefix", key: "/registry/apps/deployments/c1/default/d1", wantFound: false}, + {name: "empty", key: "", wantFound: false}, + } + + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + gotCID, found := r.ClusterFromStorageKey(tt.key) + if found != tt.wantFound { + t.Fatalf("found=%v want=%v", found, tt.wantFound) + } + if gotCID != tt.wantCID { + t.Fatalf("cid=%q want=%q", gotCID, tt.wantCID) + } + }) + } +} + diff --git a/pkg/multicluster/storage_test.go b/pkg/multicluster/storage_test.go index fc5398e..3ddc0b0 100644 --- a/pkg/multicluster/storage_test.go +++ b/pkg/multicluster/storage_test.go @@ -272,8 +272,8 @@ func TestCreateEnforcesClusterLabel(t *testing.T) { if !ok || got == nil { t.Fatalf("expected recorded configmap create object") } - if got.Labels[DefaultClusterAnnotation] != "c1" { - t.Fatalf("expected cluster label c1, got %q", got.Labels[DefaultClusterAnnotation]) + if ObjectClusterIdentity(got) != "" { + t.Fatalf("expected no object metadata cluster identity, got %q", ObjectClusterIdentity(got)) } } @@ -328,7 +328,7 @@ func TestGuaranteedUpdateEnforcesClusterLabel(t *testing.T) { if !ok || got == nil { t.Fatalf("expected recorded updated configmap") } - if got.Labels[DefaultClusterAnnotation] != "c2" { - t.Fatalf("expected cluster label c2, got %q", got.Labels[DefaultClusterAnnotation]) + if ObjectClusterIdentity(got) != "" { + t.Fatalf("expected no object metadata cluster identity, got %q", ObjectClusterIdentity(got)) } } diff --git a/pkg/multicluster/storagekeyaware/backend.go b/pkg/multicluster/storagekeyaware/backend.go new file mode 100644 index 0000000..f23dc0b --- /dev/null +++ b/pkg/multicluster/storagekeyaware/backend.go @@ -0,0 +1,420 @@ +package storagekeyaware + +import ( + "context" + "fmt" + "reflect" + "strconv" + "strings" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/apiserver/pkg/storage" + "k8s.io/apiserver/pkg/storage/storagebackend" + "go.etcd.io/etcd/client/pkg/v3/transport" + clientv3 "go.etcd.io/etcd/client/v3" + + mcstorage 
"github.com/kplane-dev/apiserver/pkg/multicluster/storage" +) + +// Backend is a local fork boundary for key-aware storage behavior. +// It currently forwards to the delegate while centralizing hooks for +// InternalEntry-based list/watch handling. +type Backend struct { + delegate storage.Interface + kindRootPrefix string + etcdPrefix string + transport storagebackend.TransportConfig + resolver mcstorage.PlacementResolver + remember func(runtime.Object, string) + + clientMu sync.Mutex + client *clientv3.Client + + revCluster sync.Map // map[string]string +} + +func New( + delegate storage.Interface, + kindRootPrefix string, + etcdPrefix string, + transport storagebackend.TransportConfig, + remember func(runtime.Object, string), +) storage.Interface { + if delegate == nil { + return nil + } + trimmedRoot := strings.TrimSuffix(kindRootPrefix, "/") + return &Backend{ + delegate: delegate, + kindRootPrefix: trimmedRoot, + etcdPrefix: strings.TrimSuffix(etcdPrefix, "/"), + transport: transport, + resolver: mcstorage.KeyLayoutPlacementResolver{KindRootPrefix: trimmedRoot}, + remember: remember, + } +} + +func (b *Backend) Versioner() storage.Versioner { return b.delegate.Versioner() } + +func (b *Backend) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error { + if err := b.delegate.Create(ctx, key, obj, out, ttl); err != nil { + return err + } + if cid, ok := b.resolver.ClusterFromStorageKey(key); ok { + b.rememberCluster(out, cid) + } + return nil +} + +func (b *Backend) Delete(ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions, validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object, opts storage.DeleteOptions) error { + return b.delegate.Delete(ctx, key, out, preconditions, validateDeletion, cachedExistingObject, opts) +} + +func (b *Backend) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) { + source, err := b.delegate.Watch(ctx, 
key, opts) + if err != nil { + return nil, err + } + return &entryWrapWatch{ + source: source, + backend: b, + key: key, + }, nil +} + +func (b *Backend) Get(ctx context.Context, key string, opts storage.GetOptions, objPtr runtime.Object) error { + if entry, ok := objPtr.(*mcstorage.InternalEntry); ok && entry != nil && entry.Object != nil { + if err := b.delegate.Get(ctx, key, opts, entry.Object); err != nil { + return err + } + entry.StorageKey = key + if cid, ok := b.resolver.ClusterFromStorageKey(key); ok { + entry.ClusterID = cid + b.rememberCluster(entry.Object, cid) + } + return nil + } + if err := b.delegate.Get(ctx, key, opts, objPtr); err != nil { + return err + } + if cid, ok := b.resolver.ClusterFromStorageKey(key); ok { + b.rememberCluster(objPtr, cid) + } + return nil +} + +func (b *Backend) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error { + if err := b.delegate.GetList(ctx, key, opts, listObj); err != nil { + return err + } + // Rebuild revision->cluster memory from durable keyspace on targeted + // cross-cluster relists. We intentionally keep this narrow to avoid + // penalizing startup/list performance for unrelated resources. 
+ if b.isCrossClusterKey(key) && shouldPrimeListIdentity(listObj) { + revToCluster, err := b.revisionClusterIndex(ctx, key) + if err == nil { + b.rememberListClusters(listObj, revToCluster) + } + } + return nil +} + +func (b *Backend) GuaranteedUpdate(ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool, preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error { + wrapped := func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + out, ttl, err := tryUpdate(input, res) + if err != nil || out == nil { + return out, ttl, err + } + if cid, ok := b.resolver.ClusterFromStorageKey(key); ok { + b.rememberCluster(out, cid) + } + return out, ttl, nil + } + if err := b.delegate.GuaranteedUpdate(ctx, key, destination, ignoreNotFound, preconditions, wrapped, cachedExistingObject); err != nil { + return err + } + if cid, ok := b.resolver.ClusterFromStorageKey(key); ok { + b.rememberCluster(destination, cid) + } + return nil +} + +func (b *Backend) ReadinessCheck() error { return b.delegate.ReadinessCheck() } + +func (b *Backend) RequestWatchProgress(ctx context.Context) error { + return b.delegate.RequestWatchProgress(ctx) +} + +func (b *Backend) GetCurrentResourceVersion(ctx context.Context) (uint64, error) { + return b.delegate.GetCurrentResourceVersion(ctx) +} + +func (b *Backend) SetKeysFunc(keysFunc storage.KeysFunc) { b.delegate.SetKeysFunc(keysFunc) } + +func (b *Backend) GetListWithAlloc(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object, growSlice bool) error { + if alloc, ok := b.delegate.(interface { + GetListWithAlloc(context.Context, string, storage.ListOptions, runtime.Object, bool) error + }); ok { + return alloc.GetListWithAlloc(ctx, key, opts, listObj, growSlice) + } + return b.delegate.GetList(ctx, key, opts, listObj) +} + +func (b *Backend) Stats(ctx context.Context) (storage.Stats, error) { return 
b.delegate.Stats(ctx) } + +func (b *Backend) CompactRevision() int64 { return b.delegate.CompactRevision() } + +type entryWrapWatch struct { + source watch.Interface + backend *Backend + key string +} + +func (w *entryWrapWatch) Stop() { + if w.source != nil { + w.source.Stop() + } +} + +func (w *entryWrapWatch) ResultChan() <-chan watch.Event { + ch := make(chan watch.Event) + go func() { + defer close(ch) + scopeClusterID, _ := w.backend.resolver.ClusterFromStorageKey(w.key) + for evt := range w.source.ResultChan() { + obj := evt.Object + if obj == nil { + continue + } + storageKey := objectStorageKey(w.key, obj, !w.backend.isCrossClusterKey(w.key)) + clusterID := scopeClusterID + if clusterID == "" { + if cid, ok := w.backend.resolver.ClusterFromStorageKey(storageKey); ok { + clusterID = cid + } + } + if clusterID == "" { + // Keep identity derivation strictly key-based: scope key for scoped + // watches, object storage key for cross-cluster watches. + // If neither yields a cluster, leave it unset. 
+ } + w.backend.rememberCluster(obj, clusterID) + ch <- evt + } + }() + return ch +} + +func (b *Backend) rememberCluster(obj runtime.Object, clusterID string) { + if obj == nil || clusterID == "" { + return + } + if b.remember != nil { + b.remember(obj, clusterID) + } + accessor, err := meta.Accessor(obj) + if err != nil { + return + } + rv := accessor.GetResourceVersion() + if rv == "" { + return + } + b.revCluster.Store(rv, clusterID) +} + +func (b *Backend) rememberListClusters(listObj runtime.Object, revToCluster map[string]string) { + if listObj == nil || len(revToCluster) == 0 { + return + } + _ = meta.EachListItem(listObj, func(obj runtime.Object) error { + accessor, err := meta.Accessor(obj) + if err != nil { + return nil + } + rv := accessor.GetResourceVersion() + if rv == "" { + return nil + } + cid := revToCluster[objectIdentityKey(rv, accessor.GetNamespace(), accessor.GetName())] + if cid == "" { + cid = revToCluster[rv] + } + if cid == "" { + return nil + } + b.rememberCluster(obj, cid) + return nil + }) +} + +func (b *Backend) isCrossClusterKey(key string) bool { + trimmed := strings.TrimSuffix(key, "/") + return trimmed == b.kindRootPrefix +} + +func (b *Backend) revisionClusterIndex(ctx context.Context, key string) (map[string]string, error) { + cli, err := b.getETCDClient() + if err != nil { + return nil, err + } + fullPrefix := b.fullETCDPrefixForKey(key) + if fullPrefix == "" { + return nil, fmt.Errorf("empty etcd prefix") + } + queryPrefix := fullPrefix + if !strings.HasSuffix(queryPrefix, "/") { + queryPrefix += "/" + } + etcdCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + resp, err := cli.Get(etcdCtx, queryPrefix, clientv3.WithPrefix(), clientv3.WithKeysOnly()) + if err != nil { + return nil, err + } + out := make(map[string]string, len(resp.Kvs)) + storagePrefix := strings.TrimSuffix(key, "/") + "/" + for _, kv := range resp.Kvs { + storageKey := b.storageKeyFromETCDKey(string(kv.Key)) + if storageKey == "" { + 
continue + } + cid, ok := b.resolver.ClusterFromStorageKey(storageKey) + if !ok || cid == "" { + continue + } + rv := strconv.FormatInt(kv.ModRevision, 10) + out[rv] = cid + if ns, name, ok := objectPathFromStorageKey(storageKey, storagePrefix); ok { + out[objectIdentityKey(rv, ns, name)] = cid + } + } + return out, nil +} + +func objectPathFromStorageKey(storageKey, rootPrefix string) (namespace, name string, ok bool) { + if !strings.HasPrefix(storageKey, rootPrefix) { + return "", "", false + } + rest := strings.TrimPrefix(storageKey, rootPrefix) + parts := strings.Split(rest, "/") + if len(parts) < 2 { + return "", "", false + } + // parts[0] is cluster ID. + switch len(parts) { + case 2: + return "", parts[1], true + default: + return parts[1], parts[2], true + } +} + +func objectIdentityKey(rv, namespace, name string) string { + return rv + "|" + namespace + "|" + name +} + +func (b *Backend) fullETCDPrefixForKey(storageKey string) string { + if storageKey == "" { + return "" + } + return strings.TrimSuffix(b.etcdPrefix, "/") + "/" + strings.TrimPrefix(storageKey, "/") +} + +func (b *Backend) storageKeyFromETCDKey(etcdKey string) string { + if etcdKey == "" { + return "" + } + prefix := strings.TrimSuffix(b.etcdPrefix, "/") + if prefix == "" { + return etcdKey + } + trimmed := strings.TrimPrefix(etcdKey, prefix) + if trimmed == etcdKey { + return "" + } + if !strings.HasPrefix(trimmed, "/") { + trimmed = "/" + trimmed + } + return trimmed +} + +func (b *Backend) getETCDClient() (*clientv3.Client, error) { + b.clientMu.Lock() + defer b.clientMu.Unlock() + if b.client != nil { + return b.client, nil + } + if len(b.transport.ServerList) == 0 { + return nil, fmt.Errorf("no etcd servers configured") + } + cfg := clientv3.Config{ + Endpoints: append([]string{}, b.transport.ServerList...), + DialTimeout: 5 * time.Second, + } + if b.transport.CertFile != "" || b.transport.KeyFile != "" || b.transport.TrustedCAFile != "" { + tlsInfo := transport.TLSInfo{ + CertFile: 
b.transport.CertFile, + KeyFile: b.transport.KeyFile, + TrustedCAFile: b.transport.TrustedCAFile, + } + tlsCfg, err := tlsInfo.ClientConfig() + if err != nil { + return nil, err + } + cfg.TLS = tlsCfg + } + cli, err := clientv3.New(cfg) + if err != nil { + return nil, err + } + b.client = cli + return b.client, nil +} + +func shouldPrimeListIdentity(listObj runtime.Object) bool { + if listObj == nil { + return false + } + // Shared auth projection relies on cluster attribution for RBAC resources. + // Prime only these lists to make restart/relist deterministic without + // introducing broad etcd key-scan overhead. + t := reflect.TypeOf(listObj).String() + return strings.HasSuffix(t, ".RoleList") || + strings.HasSuffix(t, ".RoleBindingList") || + strings.HasSuffix(t, ".ClusterRoleList") || + strings.HasSuffix(t, ".ClusterRoleBindingList") +} + +func objectStorageKey(requestKey string, obj runtime.Object, appendObjectPath bool) string { + if strings.TrimSpace(requestKey) == "" || obj == nil { + return "" + } + if !appendObjectPath { + return strings.TrimSuffix(requestKey, "/") + } + accessor, err := meta.Accessor(obj) + if err != nil { + return requestKey + } + name := accessor.GetName() + if name == "" { + return requestKey + } + base := strings.TrimSuffix(requestKey, "/") + namespace := accessor.GetNamespace() + if namespace == "" { + if strings.HasSuffix(base, "/"+name) { + return base + } + return base + "/" + name + } + if strings.HasSuffix(base, "/"+namespace+"/"+name) { + return base + } + return base + "/" + namespace + "/" + name +} diff --git a/test/smoke/apiserver_test.go b/test/smoke/apiserver_test.go index 39aef50..972e04e 100644 --- a/test/smoke/apiserver_test.go +++ b/test/smoke/apiserver_test.go @@ -114,6 +114,7 @@ func mustWriteTokenFile(t *testing.T, path string) string { type apiserverOptions struct { rootCluster string etcdPrefix string + port int extraArgs []string } @@ -128,7 +129,10 @@ func startAPIServerWithOptions(t *testing.T, etcdEndpoints 
string, opts apiserve } bin := buildAPIServerBinary(t) - port := mustFreePort(t) + port := opts.port + if port == 0 { + port = mustFreePort(t) + } tmp := t.TempDir() ctx, cancel := context.WithCancel(context.Background()) diff --git a/test/smoke/auth_restart_test.go b/test/smoke/auth_restart_test.go new file mode 100644 index 0000000..81df342 --- /dev/null +++ b/test/smoke/auth_restart_test.go @@ -0,0 +1,121 @@ +package smoke + +import ( + "context" + "fmt" + "os" + "testing" + "time" + + authorizationv1 "k8s.io/api/authorization/v1" + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestRBACIsolationAfterAPIServerRestart(t *testing.T) { + etcd := os.Getenv("ETCD_ENDPOINTS") + if etcd == "" { + t.Skip("ETCD_ENDPOINTS is not set; skipping integration smoke tests") + } + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) + defer cancel() + + port := mustFreePort(t) + prefix := fmt.Sprintf("/registry-smoke-restart-rbac-%d", time.Now().UnixNano()) + opts := apiserverOptions{etcdPrefix: prefix, port: port} + + s1 := startAPIServerWithOptions(t, etcd, opts) + clusterA := "c-" + randSuffix(3) + subjectUser := "user-" + randSuffix(4) + roleName := "role-" + randSuffix(4) + bindingName := "rb-" + randSuffix(4) + + csA1 := kubeClientForCluster(t, s1, clusterA) + csRoot1 := kubeClientForCluster(t, s1, s1.root) + + _, err := csA1.RbacV1().ClusterRoles().Create(ctx, &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{Name: roleName}, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"configmaps"}, + Verbs: []string{"get"}, + }, + }, + }, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("cluster=%s create clusterrole: %v", clusterA, err) + } + _, err = csA1.RbacV1().ClusterRoleBindings().Create(ctx, &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: bindingName}, + Subjects: []rbacv1.Subject{ + {Kind: rbacv1.UserKind, 
Name: subjectUser}, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "ClusterRole", + Name: roleName, + }, + }, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("cluster=%s create clusterrolebinding: %v", clusterA, err) + } + + sar := &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + User: subjectUser, + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: "default", + Verb: "get", + Group: "", + Resource: "configmaps", + }, + }, + } + waitForSubjectAccessReviewWithLogs(ctx, t, csA1, sar, true, s1.logs) + waitForSubjectAccessReviewWithLogs(ctx, t, csRoot1, sar, false, s1.logs) + + // Simulate a full process restart while keeping persisted etcd state. + s1.Stop() + + s2 := startAPIServerWithOptions(t, etcd, opts) + csA2 := kubeClientForCluster(t, s2, clusterA) + csRoot2 := kubeClientForCluster(t, s2, s2.root) + + waitForRBACObject := func(name string, getFn func(context.Context, string, metav1.GetOptions) (interface{}, error)) { + deadline := time.Now().Add(20 * time.Second) + var lastErr error + for time.Now().Before(deadline) { + if _, err := getFn(ctx, name, metav1.GetOptions{}); err == nil { + return + } else { + lastErr = err + } + time.Sleep(300 * time.Millisecond) + } + t.Fatalf("cluster=%s object %q not visible after restart: %v", clusterA, name, lastErr) + } + waitForRBACObject(roleName, func(ctx context.Context, name string, opts metav1.GetOptions) (interface{}, error) { + return csA2.RbacV1().ClusterRoles().Get(ctx, name, opts) + }) + waitForRBACObject(bindingName, func(ctx context.Context, name string, opts metav1.GetOptions) (interface{}, error) { + return csA2.RbacV1().ClusterRoleBindings().Get(ctx, name, opts) + }) + if _, err := csRoot2.RbacV1().ClusterRoles().Get(ctx, roleName, metav1.GetOptions{}); err == nil { + t.Fatalf("unexpectedly found clusterrole %q in root cluster after restart", roleName) + } else if !apierrors.IsNotFound(err) { + t.Fatalf("root clusterrole 
get unexpected error: %v", err) + } + if _, err := csRoot2.RbacV1().ClusterRoleBindings().Get(ctx, bindingName, metav1.GetOptions{}); err == nil { + t.Fatalf("unexpectedly found clusterrolebinding %q in root cluster after restart", bindingName) + } else if !apierrors.IsNotFound(err) { + t.Fatalf("root clusterrolebinding get unexpected error: %v", err) + } + + // Shared RBAC projection must rebuild from relist/watch after restart. + waitForSubjectAccessReviewWithLogs(ctx, t, csA2, sar, true, s2.logs) + waitForSubjectAccessReviewWithLogs(ctx, t, csRoot2, sar, false, s2.logs) +} + diff --git a/test/smoke/auth_test.go b/test/smoke/auth_test.go index f832abb..e8efa64 100644 --- a/test/smoke/auth_test.go +++ b/test/smoke/auth_test.go @@ -7,7 +7,6 @@ import ( "testing" "time" - mc "github.com/kplane-dev/apiserver/pkg/multicluster" authenticationv1 "k8s.io/api/authentication/v1" authorizationv1 "k8s.io/api/authorization/v1" corev1 "k8s.io/api/core/v1" @@ -45,12 +44,10 @@ func TestRBACIsolationAcrossClusters(t *testing.T) { }, }, metav1.CreateOptions{}) if err != nil { - t.Fatalf("cluster=%s create clusterrole: %v", clusterA, err) + t.Fatalf("cluster=%s create clusterrole: %v\nlogs:\n%s", clusterA, err, s.logs()) } - if got, getErr := csA.RbacV1().ClusterRoles().Get(ctx, roleName, metav1.GetOptions{}); getErr != nil { + if _, getErr := csA.RbacV1().ClusterRoles().Get(ctx, roleName, metav1.GetOptions{}); getErr != nil { t.Fatalf("cluster=%s get clusterrole: %v", clusterA, getErr) - } else if got.Labels[mc.DefaultClusterAnnotation] != clusterA { - t.Fatalf("cluster=%s clusterrole missing label %q=%q labels=%v", clusterA, mc.DefaultClusterAnnotation, clusterA, got.Labels) } t.Cleanup(func() { _ = csA.RbacV1().ClusterRoles().Delete(context.Background(), roleName, metav1.DeleteOptions{}) @@ -70,10 +67,8 @@ func TestRBACIsolationAcrossClusters(t *testing.T) { if err != nil { t.Fatalf("cluster=%s create clusterrolebinding: %v", clusterA, err) } - if got, getErr := 
csA.RbacV1().ClusterRoleBindings().Get(ctx, bindingName, metav1.GetOptions{}); getErr != nil { + if _, getErr := csA.RbacV1().ClusterRoleBindings().Get(ctx, bindingName, metav1.GetOptions{}); getErr != nil { t.Fatalf("cluster=%s get clusterrolebinding: %v", clusterA, getErr) - } else if got.Labels[mc.DefaultClusterAnnotation] != clusterA { - t.Fatalf("cluster=%s clusterrolebinding missing label %q=%q labels=%v", clusterA, mc.DefaultClusterAnnotation, clusterA, got.Labels) } t.Cleanup(func() { _ = csA.RbacV1().ClusterRoleBindings().Delete(context.Background(), bindingName, metav1.DeleteOptions{}) @@ -141,7 +136,7 @@ func TestServiceAccountTokenIsolationAcrossClusters(t *testing.T) { waitForTokenReview(ctx, t, csA, tokenRoot, false) } -func TestRBACCreateSetsClusterLabel(t *testing.T) { +func TestRBACCreateClusterRole(t *testing.T) { etcd := os.Getenv("ETCD_ENDPOINTS") s := startAPIServer(t, etcd) @@ -165,8 +160,8 @@ func TestRBACCreateSetsClusterLabel(t *testing.T) { if err != nil { t.Fatalf("cluster=%s create clusterrole: %v", clusterA, err) } - if obj.Labels[mc.DefaultClusterAnnotation] != clusterA { - t.Fatalf("expected cluster label %q=%q, got labels=%v", mc.DefaultClusterAnnotation, clusterA, obj.Labels) + if obj == nil { + t.Fatalf("expected created clusterrole object") } } diff --git a/test/smoke/memory_200vcp_test.go b/test/smoke/memory_200vcp_test.go index 02f3ba7..cf4df4d 100644 --- a/test/smoke/memory_200vcp_test.go +++ b/test/smoke/memory_200vcp_test.go @@ -137,7 +137,7 @@ func exerciseClusterForMemory(ctx context.Context, t *testing.T, s *testAPIServe plural := "memwidgets" crdName := plural + "." + group createTestCRD(ctx, t, crdClient, crdName, group, plural) - waitForCRDEstablished(ctx, t, crdClient, clusterID, crdName) + waitForCRDEstablished(ctx, t, crdClient, clusterID, crdName) waitForResourcePresence(t, cs, clusterID, group+"/v1", plural, true) return nil }