From 32574a4cca3382203355177a2095424bc95cfa24 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Thu, 12 Feb 2026 14:24:01 +0100 Subject: [PATCH 01/24] fix: now stores more data, refined repositories --- TODOS.md | 5 +- cmd/main.go | 21 ++--- go.mod | 12 +-- go.sum | 10 +++ internal/api/handlers/failover/failover.go | 5 +- internal/api/handlers/spoofs/overrides.go | 47 ++++++----- internal/api/handlers/spoofs/service.go | 11 ++- internal/api/handlers/spoofs/spoofs.go | 6 +- internal/dns/handler.go | 10 ++- internal/dns/updater.go | 46 ++++++----- internal/manager/manager.go | 2 +- internal/manager/query_manager.go | 2 +- internal/model/service.go | 29 +++++++ internal/repositories/service/service.go | 91 ++++++++++++++++++++++ internal/repositories/spoof/spoof.go | 66 +++++++--------- internal/service/service.go | 4 + pkg/models/spoofs/override.go | 2 +- pkg/persistence/persistence.go | 1 + pkg/persistence/store/file/file.go | 12 ++- pkg/persistence/store/memory/memory.go | 4 + 20 files changed, 271 insertions(+), 115 deletions(-) create mode 100644 internal/model/service.go create mode 100644 internal/repositories/service/service.go diff --git a/TODOS.md b/TODOS.md index 9be18b7..12b9a82 100644 --- a/TODOS.md +++ b/TODOS.md @@ -8,7 +8,10 @@ - flags loader for config variables - OnShutDown functions to save current state on shutdown + - expand to OnStart - If svc not in DC, then roundtrip decides priority -- AUTH \ No newline at end of file +- AUTH + +- Webhooks notifies on event? 
\ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index 8cff1e2..f6e61c0 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -11,17 +11,17 @@ import ( "time" "github.com/vitistack/gslb-operator/internal/api/handlers/failover" - spoofs "github.com/vitistack/gslb-operator/internal/api/handlers/spoofs" + "github.com/vitistack/gslb-operator/internal/api/handlers/spoofs" "github.com/vitistack/gslb-operator/internal/api/routes" "github.com/vitistack/gslb-operator/internal/config" "github.com/vitistack/gslb-operator/internal/dns" "github.com/vitistack/gslb-operator/internal/manager" - spoofsrepo "github.com/vitistack/gslb-operator/internal/repositories/spoof" + "github.com/vitistack/gslb-operator/internal/model" + "github.com/vitistack/gslb-operator/internal/repositories/service" "github.com/vitistack/gslb-operator/pkg/auth" "github.com/vitistack/gslb-operator/pkg/auth/jwt" "github.com/vitistack/gslb-operator/pkg/bslog" "github.com/vitistack/gslb-operator/pkg/lua" - apiContractSpoof "github.com/vitistack/gslb-operator/pkg/models/spoofs" "github.com/vitistack/gslb-operator/pkg/persistence/store/file" "github.com/vitistack/gslb-operator/pkg/rest/middleware" ) @@ -41,14 +41,14 @@ func main() { manager.WithNonBlockingBufferSize(110), ) - spoofsFileStore, err := file.NewStore[apiContractSpoof.Spoof]("store.json") + serviceFileStore, err := file.NewStore[model.Service]("store.json") if err != nil { bslog.Fatal("could not create persistent storage", slog.String("reason", err.Error())) } - spoofRepo := spoofsrepo.NewRepository(spoofsFileStore) + svcRepo := service.NewServiceRepo(serviceFileStore) updater, err := dns.NewUpdater( - dns.UpdaterWithSpoofRepo(spoofRepo), + dns.UpdaterWithSpoofRepo(svcRepo), ) if err != nil { bslog.Fatal("unable to create updater", slog.String("error", err.Error())) @@ -65,12 +65,13 @@ func main() { api := http.NewServeMux() // routes handlers - spoofsApiService := spoofs.NewSpoofsService(spoofRepo, mgr) + spoofsApiService := 
spoofs.NewSpoofsService(serviceFileStore, mgr) - failoverApiService := failover.NewFailoverService(spoofRepo, mgr) + failoverApiService := failover.NewFailoverService(mgr) // initializing the service jwt self signer jwt.InitServiceTokenManager(cfg.JWT().Secret(), cfg.JWT().User()) + fmt.Println(jwt.GetInstance().GetServiceToken()) api.HandleFunc(routes.POST_FAILOVER, middleware.Chain( middleware.WithIncomingRequestLogging(slog.Default()), @@ -131,10 +132,10 @@ func main() { case <-quit: bslog.Info("gracefully shutting down...") } - + shutdown, cancel := context.WithTimeout(background, time.Second*5) defer cancel() - + dnsHandler.Stop(shutdown) if err := server.Shutdown(shutdown); err != nil { panic("error shutting down server: " + err.Error()) diff --git a/go.mod b/go.mod index d3b6b67..65df623 100644 --- a/go.mod +++ b/go.mod @@ -1,18 +1,18 @@ module github.com/vitistack/gslb-operator -go 1.25.0 +go 1.25.7 require ( - codeberg.org/miekg/dns v0.5.21 + codeberg.org/miekg/dns v0.6.48 github.com/golang-jwt/jwt/v5 v5.3.1 github.com/google/uuid v1.6.0 github.com/joho/godotenv v1.5.1 - github.com/tevino/tcp-shaker v0.0.0-20251020080735-c4094cd6c927 + github.com/tevino/tcp-shaker v0.0.0-20260210162928-fb888f26451b github.com/yuin/gopher-lua v1.1.1 - golang.org/x/crypto v0.43.0 + golang.org/x/crypto v0.48.0 ) require ( - golang.org/x/net v0.46.0 // indirect - golang.org/x/sys v0.37.0 // indirect + golang.org/x/net v0.50.0 // indirect + golang.org/x/sys v0.41.0 // indirect ) diff --git a/go.sum b/go.sum index 2791e99..6e25b55 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ codeberg.org/miekg/dns v0.5.21 h1:O+Ibq9IJuOeMoBnNmYdQmKJ7J9zgEsUqcbBhjsSrzIc= codeberg.org/miekg/dns v0.5.21/go.mod h1:Q10KolpjjNhl9x14KdKA3s+7Xynb8Zqvjj9jWyzrYRA= +codeberg.org/miekg/dns v0.6.48 h1:+RZiJMKPq5BYjePB7AfTv7O+qf/3Kjsz9C4WmOUHdoA= +codeberg.org/miekg/dns v0.6.48/go.mod h1:fIxAzBMDPnXWSw0fp8+pfZMRiAqYY4+HHYLzUo/S6Dg= github.com/golang-jwt/jwt/v5 v5.3.1 
h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -8,11 +10,19 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/tevino/tcp-shaker v0.0.0-20251020080735-c4094cd6c927 h1:BdtSwzS6fNIAC3Ylj3x/ak6PD4EV885gGhWR7eIplEI= github.com/tevino/tcp-shaker v0.0.0-20251020080735-c4094cd6c927/go.mod h1:S0VUAF1puvgOrlSQqCrJiz2t7yn2gPKYSpGu4+w8eg0= +github.com/tevino/tcp-shaker v0.0.0-20260210162928-fb888f26451b h1:vmeHwA9U5lODKqvdZQxKqy+i1Q2yMwShjxytoszeWmw= +github.com/tevino/tcp-shaker v0.0.0-20260210162928-fb888f26451b/go.mod h1:bNnAwCfoEQXR47eBqFYS9fD6qTcY3t5ZUUgBZskRdcY= github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= +golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= +golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= +golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= diff --git 
a/internal/api/handlers/failover/failover.go b/internal/api/handlers/failover/failover.go index 7f48014..f58baa2 100644 --- a/internal/api/handlers/failover/failover.go +++ b/internal/api/handlers/failover/failover.go @@ -5,7 +5,6 @@ import ( "net/http" "github.com/vitistack/gslb-operator/internal/manager" - "github.com/vitistack/gslb-operator/internal/repositories/spoof" "github.com/vitistack/gslb-operator/pkg/bslog" "github.com/vitistack/gslb-operator/pkg/models/failover" "github.com/vitistack/gslb-operator/pkg/rest/request" @@ -13,13 +12,11 @@ import ( ) type FailoverService struct { - spoofRepo *spoof.Repository serviceManager manager.QueryManager } -func NewFailoverService(repo *spoof.Repository, mgr manager.QueryManager) *FailoverService { +func NewFailoverService(mgr manager.QueryManager) *FailoverService { return &FailoverService{ - spoofRepo: repo, serviceManager: mgr, } } diff --git a/internal/api/handlers/spoofs/overrides.go b/internal/api/handlers/spoofs/overrides.go index f6430ad..9140f9c 100644 --- a/internal/api/handlers/spoofs/overrides.go +++ b/internal/api/handlers/spoofs/overrides.go @@ -22,15 +22,15 @@ import ( func (ss *SpoofsService) GetOverride(w http.ResponseWriter, r *http.Request) { logger := bslog.With(slog.Any("request_id", r.Context().Value("id"))) - fqdn := r.PathValue("fqdn") + memberOf := r.PathValue("memberOf") - if fqdn == "" { + if memberOf == "" { logger.Error("skipping request due to insufficient input parameters", slog.String("reason", "missing fqdn")) response.Err(w, response.ErrInvalidInput, "missing fqdn") return } - exist, err := ss.SpoofRepo.ReadFQDN(fqdn) + exist, err := ss.spoofRepo.ReadFQDN(memberOf) if err != nil { logger.Error("could not read spoofs", slog.String("reason", err.Error())) response.Err(w, response.ErrInternalError, "") @@ -64,7 +64,7 @@ func (ss *SpoofsService) CreateOverride(w http.ResponseWriter, r *http.Request) if err != nil { logger.Error("could not override spoof", slog.String("reason", 
err.Error())) if errors.Is(err, spoofRepo.ErrSpoofWithFQDNNotFound) { - response.Err(w, response.ErrNotFound, "fqdn not found: "+override.FQDN) + response.Err(w, response.ErrNotFound, "fqdn not found: "+override.MemberOf) return } @@ -90,7 +90,7 @@ func (ss *SpoofsService) UpdateOverride(w http.ResponseWriter, r *http.Request) if err != nil { logger.Error("could not update spoof", slog.String("reason", err.Error())) if errors.Is(err, spoofRepo.ErrSpoofWithFQDNNotFound) { - response.Err(w, response.ErrNotFound, "fqdn not found: "+override.FQDN) + response.Err(w, response.ErrNotFound, "member-of not found: "+override.MemberOf) return } @@ -123,24 +123,24 @@ func (ss *SpoofsService) DeleteOverride(w http.ResponseWriter, r *http.Request) } func (ss *SpoofsService) newOverride(override spoofs.Override) error { - exist, err := ss.SpoofRepo.ReadFQDN(override.FQDN) + exist, err := ss.svcRepo.FetchServiceMemberOf(override.MemberOf) if err != nil { return fmt.Errorf("unable to read spoofs from storage: %w", err) } - if exist.DC == "OVERRIDE" { - return fmt.Errorf("service already has active override: %s", exist.FQDN) + if exist.Datacenter == "OVERRIDE" { + return fmt.Errorf("service already has active override: %s", exist.MemberOf) } - err = ss.SpoofRepo.Delete(exist.Key()) + err = ss.svcRepo.Delete(exist.Key()) if err != nil { return fmt.Errorf("could not delete old spoof: %w", err) } - exist.DC = "OVERRIDE" + exist.Datacenter = "OVERRIDE" exist.IP = override.IP.String() - err = ss.SpoofRepo.Create(exist.Key(), &exist) + err = ss.svcRepo.Create(&exist) if err != nil { return fmt.Errorf("could not create spoof: %w", err) } @@ -149,18 +149,18 @@ func (ss *SpoofsService) newOverride(override spoofs.Override) error { } func (ss *SpoofsService) updateOverride(override spoofs.Override) error { - exist, err := ss.SpoofRepo.ReadFQDN(override.FQDN) + exist, err := ss.svcRepo.FetchServiceMemberOf(override.MemberOf) if err != nil { return fmt.Errorf("unable to read spoofs from 
storage: %w", err) } - if exist.DC != "OVERRIDE" { - return fmt.Errorf("%s does not have an active override", override.FQDN) + if exist.Datacenter != "OVERRIDE" { + return fmt.Errorf("%s does not have an active override", override.MemberOf) } exist.IP = override.IP.String() - err = ss.SpoofRepo.Update(exist.Key(), &exist) + err = ss.svcRepo.Update(exist.Key(), &exist) if err != nil { return fmt.Errorf("could not update spoof: %w", err) } @@ -169,17 +169,18 @@ func (ss *SpoofsService) updateOverride(override spoofs.Override) error { } func (ss *SpoofsService) deleteOverride(override spoofs.Override) error { - exist, err := ss.SpoofRepo.ReadFQDN(override.FQDN) + exist, err := ss.svcRepo.FetchServiceMemberOf(override.MemberOf) if err != nil { return fmt.Errorf("unable to read spoofs from storage: %w", err) } - if exist.DC != "OVERRIDE" { - return fmt.Errorf("%s does not have an override currently set", override.FQDN) + if exist.Datacenter != "OVERRIDE" { + return fmt.Errorf("%s does not have an override currently set", override.MemberOf) } spoof := ss.restoreSpoof(override) - err = ss.SpoofRepo.Delete(exist.Key()) + + err = ss.svcRepo.Delete(exist.Key()) if err != nil { return fmt.Errorf("could not update spoof: %w", err) } @@ -188,7 +189,11 @@ func (ss *SpoofsService) deleteOverride(override spoofs.Override) error { return nil } - err = ss.SpoofRepo.Create(spoof.Key(), spoof) + exist.Datacenter = spoof.DC + exist.Fqdn = spoof.FQDN + exist.IP = spoof.IP + + err = ss.svcRepo.Create(&exist) if err != nil { return fmt.Errorf("could not create spoof for active service: %w", err) } @@ -197,7 +202,7 @@ func (ss *SpoofsService) deleteOverride(override spoofs.Override) error { } func (ss *SpoofsService) restoreSpoof(override spoofs.Override) *spoofs.Spoof { - svc := ss.serviceManager.GetActiveForFQDN(override.FQDN) + svc := ss.serviceManager.GetActiveForMemberOf(override.MemberOf) if svc == nil { // no active service: e.g. 
no spoof should be there return nil } diff --git a/internal/api/handlers/spoofs/service.go b/internal/api/handlers/spoofs/service.go index f32c06b..5921f8c 100644 --- a/internal/api/handlers/spoofs/service.go +++ b/internal/api/handlers/spoofs/service.go @@ -2,17 +2,22 @@ package spoofs import ( "github.com/vitistack/gslb-operator/internal/manager" + "github.com/vitistack/gslb-operator/internal/model" + "github.com/vitistack/gslb-operator/internal/repositories/service" "github.com/vitistack/gslb-operator/internal/repositories/spoof" + "github.com/vitistack/gslb-operator/pkg/persistence" ) type SpoofsService struct { - SpoofRepo *spoof.Repository + svcRepo *service.ServiceRepo + spoofRepo *spoof.SpoofRepo serviceManager manager.QueryManager } -func NewSpoofsService(repo *spoof.Repository, svcManager manager.QueryManager) *SpoofsService { +func NewSpoofsService(store persistence.Store[model.Service], svcManager manager.QueryManager) *SpoofsService { return &SpoofsService{ - SpoofRepo: repo, + svcRepo: service.NewServiceRepo(store), + spoofRepo: spoof.NewSpoofRepo(store), // create read-only serviceManager: svcManager, } } diff --git a/internal/api/handlers/spoofs/spoofs.go b/internal/api/handlers/spoofs/spoofs.go index 3f93f42..ae392c0 100644 --- a/internal/api/handlers/spoofs/spoofs.go +++ b/internal/api/handlers/spoofs/spoofs.go @@ -17,7 +17,7 @@ import ( ) func (ss *SpoofsService) GetSpoofs(w http.ResponseWriter, r *http.Request) { - data, err := ss.SpoofRepo.ReadAll() + data, err := ss.spoofRepo.ReadAll() if err != nil { response.Err(w, response.ErrInternalError, "unable to fetch spoofs from storage") bslog.Error("Unable to fetch spoofs", slog.String("reason", err.Error())) @@ -43,7 +43,7 @@ func (ss *SpoofsService) GetFQDNSpoof(w http.ResponseWriter, r *http.Request) { return } - spoof, err := ss.SpoofRepo.Read(fqdn) + spoof, err := ss.spoofRepo.Read(fqdn) if err != nil { msg := "unable to fetch spoof with id: " + fqdn + " from storage" response.Err(w, 
response.ErrInternalError, msg) @@ -55,7 +55,7 @@ func (ss *SpoofsService) GetFQDNSpoof(w http.ResponseWriter, r *http.Request) { } func (ss *SpoofsService) GetSpoofsHash(w http.ResponseWriter, r *http.Request) { - data, err := ss.SpoofRepo.ReadAll() + data, err := ss.spoofRepo.ReadAll() if err != nil { response.Err(w, response.ErrInternalError, "unable to fetch spoofs from storage") bslog.Error("unable to read spoofs from storage", slog.String("reason", err.Error())) diff --git a/internal/dns/handler.go b/internal/dns/handler.go index 23b8cb9..6e8e4e8 100644 --- a/internal/dns/handler.go +++ b/internal/dns/handler.go @@ -78,11 +78,17 @@ func (h *Handler) Stop(ctx context.Context) { } func (h *Handler) onServiceDown(svc *service.Service) { - h.updater.ServiceDown(svc) + err := h.updater.ServiceDown(svc) + if err != nil { + bslog.Warn("error while updating service on service down", slog.String("error", err.Error())) + } } func (h *Handler) onServiceUp(svc *service.Service) { - h.updater.ServiceUp(svc) + err := h.updater.ServiceUp(svc) + if err != nil { + bslog.Warn("error while updating service state on service up", slog.String("error", err.Error())) + } } func (h *Handler) handleZoneUpdates(zone <-chan []dns.RR, pollErrors <-chan error) { diff --git a/internal/dns/updater.go b/internal/dns/updater.go index ea8bb06..584a50f 100644 --- a/internal/dns/updater.go +++ b/internal/dns/updater.go @@ -7,11 +7,11 @@ import ( "time" "github.com/vitistack/gslb-operator/internal/config" - "github.com/vitistack/gslb-operator/internal/repositories/spoof" + "github.com/vitistack/gslb-operator/internal/model" + svcRepo "github.com/vitistack/gslb-operator/internal/repositories/service" "github.com/vitistack/gslb-operator/internal/service" "github.com/vitistack/gslb-operator/pkg/auth/jwt" "github.com/vitistack/gslb-operator/pkg/bslog" - "github.com/vitistack/gslb-operator/pkg/models/spoofs" "github.com/vitistack/gslb-operator/pkg/persistence/store/memory" 
"github.com/vitistack/gslb-operator/pkg/rest/request" "github.com/vitistack/gslb-operator/pkg/rest/request/client" @@ -20,11 +20,11 @@ import ( type updaterOption func(u *Updater) type Updater struct { - Server string - spoofRepo *spoof.Repository - client client.HTTPClient - builder *request.Builder - mu *sync.Mutex + Server string + svcRepo *svcRepo.ServiceRepo + client client.HTTPClient + builder *request.Builder + mu *sync.Mutex } func NewUpdater(opts ...updaterOption) (*Updater, error) { @@ -39,10 +39,10 @@ func NewUpdater(opts ...updaterOption) (*Updater, error) { } u := &Updater{ - Server: config.GetInstance().GSLB().UpdaterHost(), - spoofRepo: spoof.NewRepository(memory.NewStore[spoofs.Spoof]()), - client: *c, - mu: &sync.Mutex{}, + Server: config.GetInstance().GSLB().UpdaterHost(), + svcRepo: svcRepo.NewServiceRepo(memory.NewStore[model.Service]()), + client: *c, + mu: &sync.Mutex{}, } for _, opt := range opts { @@ -53,9 +53,9 @@ func NewUpdater(opts ...updaterOption) (*Updater, error) { return u, nil } -func UpdaterWithSpoofRepo(spoofRep *spoof.Repository) updaterOption { +func UpdaterWithSpoofRepo(svcRepo *svcRepo.ServiceRepo) updaterOption { return func(u *Updater) { - u.spoofRepo = spoofRep + u.svcRepo = svcRepo } } @@ -73,7 +73,7 @@ func UpdaterWithClient(client *client.HTTPClient) updaterOption { func (u *Updater) ServiceDown(svc *service.Service) error { u.mu.Lock() - override, err := u.spoofRepo.HasOverride(svc.MemberOf) + override, err := u.svcRepo.HasOverride(svc.MemberOf) if err != nil { return fmt.Errorf("unable to delete spoof: %w", err) } @@ -83,7 +83,7 @@ func (u *Updater) ServiceDown(svc *service.Service) error { return nil } - err = u.spoofRepo.Delete(fmt.Sprintf("%s:%s", svc.MemberOf, svc.Datacenter)) + err = u.svcRepo.Delete(fmt.Sprintf("%s:%s", svc.MemberOf, svc.Datacenter)) u.mu.Unlock() if err != nil { return fmt.Errorf("unable to delete service from storage: %s", err.Error()) @@ -116,7 +116,7 @@ func (u *Updater) ServiceDown(svc 
*service.Service) error { func (u *Updater) ServiceUp(svc *service.Service) error { u.mu.Lock() - override, err := u.spoofRepo.HasOverride(svc.MemberOf) + override, err := u.svcRepo.HasOverride(svc.MemberOf) if err != nil { return fmt.Errorf("unable to store spoof: %w", err) } @@ -131,13 +131,17 @@ func (u *Updater) ServiceUp(svc *service.Service) error { return fmt.Errorf("unable to get ip address: %s", err.Error()) } - spoof := &spoofs.Spoof{ - FQDN: svc.MemberOf, - IP: ip, - DC: svc.Datacenter, + spoof := &model.Service{ + ID: svc.GetID(), + MemberOf: svc.MemberOf, + Fqdn: svc.Fqdn, + IP: ip, + Datacenter: svc.Datacenter, + IsHealthy: svc.IsHealthy(), + FailureCount: svc.GetFailureCount(), } - err = u.spoofRepo.Create(fmt.Sprintf("%s:%s", svc.MemberOf, svc.Datacenter), spoof) + err = u.svcRepo.Create(spoof) u.mu.Unlock() if err != nil { return fmt.Errorf("could not store new spoof: %s", err.Error()) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 92ce252..4fe3200 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -357,7 +357,7 @@ func (sm *ServicesManager) moveServiceToInterval(svc *service.Service, newInterv slog.Any("service", svc)) } -func (sm *ServicesManager) GetActiveForFQDN(memberOf string) *service.Service { +func (sm *ServicesManager) GetActiveForMemberOf(memberOf string) *service.Service { sm.mutex.RLock() defer sm.mutex.RUnlock() if group, ok := sm.serviceGroups[memberOf]; ok { diff --git a/internal/manager/query_manager.go b/internal/manager/query_manager.go index 357b889..69845f6 100644 --- a/internal/manager/query_manager.go +++ b/internal/manager/query_manager.go @@ -8,7 +8,7 @@ import ( // interface for API handlers that needs specific functionality from the manager. 
// without exposing all functionality type QueryManager interface { - GetActiveForFQDN(fqdn string) *service.Service + GetActiveForMemberOf(memberOf string) *service.Service //write operations Failover(fqdn string, failover failover.Failover) error diff --git a/internal/model/service.go b/internal/model/service.go new file mode 100644 index 0000000..c1ed21d --- /dev/null +++ b/internal/model/service.go @@ -0,0 +1,29 @@ +package model + +import ( + "github.com/vitistack/gslb-operator/pkg/models/spoofs" +) + +// storage representation of service +// services that are configured with gslb config end up as a service.Service +type Service struct { + ID string `json:"id"` + MemberOf string `json:"memberOf"` + Fqdn string `json:"fqdn"` + Datacenter string `json:"datacenter"` + IP string `json:"ip"` + IsHealthy bool `json:"isHealthy"` + FailureCount int `json:"failureCount"` +} + +func (s Service) Key() string { + return s.MemberOf + ":" + s.Datacenter +} + +func (s Service) Spoof() spoofs.Spoof { + return spoofs.Spoof{ + FQDN: s.Fqdn, + IP: s.IP, + DC: s.Datacenter, + } +} diff --git a/internal/repositories/service/service.go b/internal/repositories/service/service.go new file mode 100644 index 0000000..c147b52 --- /dev/null +++ b/internal/repositories/service/service.go @@ -0,0 +1,91 @@ +package service + +import ( + "errors" + "fmt" + + "github.com/vitistack/gslb-operator/internal/model" + "github.com/vitistack/gslb-operator/pkg/persistence" +) + +var ( + ErrServiceWithMemberOfNotFound = errors.New("service with member-of not found") +) + +// repository for services that are considered active in a service group +type ServiceRepo struct { + store persistence.Store[model.Service] +} + +func NewServiceRepo(store persistence.Store[model.Service]) *ServiceRepo { + return &ServiceRepo{ + store: store, + } +} + +func (sr *ServiceRepo) Create(new *model.Service) error { + err := sr.store.Save(new.Key(), *new) + if err != nil { + return fmt.Errorf("failed to store service: %w", 
err) + } + return nil +} + +func (sr *ServiceRepo) Update(id string, new *model.Service) error { + err := sr.store.Save(id, *new) + if err != nil { + return fmt.Errorf("failed to update entry with id: %s: %w", id, err) + } + return nil +} + +func (sr *ServiceRepo) Delete(id string) error { + err := sr.store.Delete(id) + if err != nil { + return fmt.Errorf("failed to delete entry with id: %s: %w", id, err) + } + return nil +} + +func (sr *ServiceRepo) Read(id string) (model.Service, error) { + svc, err := sr.store.Load(id) + if err != nil { + return model.Service{}, fmt.Errorf("failed to read from storage: %w", err) + } + return svc, nil +} +func (sr *ServiceRepo) ReadAll() ([]model.Service, error) { + services, err := sr.store.LoadAll() + if err != nil { + return nil, fmt.Errorf("failed to read from storage: %w", err) + } + + return services, nil +} + +func (sr *ServiceRepo) FetchServiceMemberOf(memberOf string) (model.Service, error) { + allServices, err := sr.ReadAll() + if err != nil { + return model.Service{}, err + } + + for _, svc := range allServices { + if svc.MemberOf == memberOf { + return svc, nil + } + } + + return model.Service{}, fmt.Errorf("%w: member-of %s", ErrServiceWithMemberOfNotFound, memberOf) +} + +func (sr *ServiceRepo) HasOverride(memberOf string) (bool, error) { + svc, err := sr.FetchServiceMemberOf(memberOf) + if err != nil { + if errors.Is(err, ErrServiceWithMemberOfNotFound) { + return false, nil + } + return false, err + } + + return svc.Datacenter == "OVERRIDE", nil +} diff --git a/internal/repositories/spoof/spoof.go b/internal/repositories/spoof/spoof.go index 7c10797..5fa8d76 100644 --- a/internal/repositories/spoof/spoof.go +++ b/internal/repositories/spoof/spoof.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" + "github.com/vitistack/gslb-operator/internal/model" "github.com/vitistack/gslb-operator/pkg/models/spoofs" "github.com/vitistack/gslb-operator/pkg/persistence" ) @@ -12,69 +13,56 @@ var ( ErrSpoofWithFQDNNotFound = 
errors.New("spoof with fqdn not found") ) -type Repository struct { - storage persistence.Store[spoofs.Spoof] +// read-only repo for spoofs +type SpoofRepo struct { + store persistence.Store[model.Service] } -func NewRepository(storage persistence.Store[spoofs.Spoof]) *Repository { - return &Repository{ - storage: storage, +func NewSpoofRepo(storage persistence.Store[model.Service]) *SpoofRepo { + return &SpoofRepo{ + store: storage, } } -func (r *Repository) Create(key string, new *spoofs.Spoof) error { - err := r.storage.Save(key, *new) +func (r *SpoofRepo) Read(id string) (spoofs.Spoof, error) { + svc, err := r.store.Load(id) if err != nil { - return fmt.Errorf("unable to store entry: %s", err.Error()) + return spoofs.Spoof{}, fmt.Errorf("failed to read from storage: %w", err) } - return nil -} -func (r *Repository) Update(id string, new *spoofs.Spoof) error { - err := r.storage.Save(id, *new) - if err != nil { - return fmt.Errorf("unable to update entry with id: %s: %s", id, err.Error()) - } - return nil + return svc.Spoof(), nil } -func (r *Repository) Delete(id string) error { - err := r.storage.Delete(id) +func (r *SpoofRepo) ReadFQDN(fqdn string) (spoofs.Spoof, error) { + allServices, err := r.store.LoadAll() if err != nil { - return fmt.Errorf("unable to delete entry with id: %s: %s", id, err.Error()) + return spoofs.Spoof{}, fmt.Errorf("failed to read from storage: %w", err) } - return nil -} -func (r *Repository) Read(id string) (spoofs.Spoof, error) { - spoof, err := r.storage.Load(id) - if err != nil { - return spoofs.Spoof{}, fmt.Errorf("unable to read resource with id: %s", err.Error()) + for _, svc := range allServices { + if svc.Fqdn == fqdn { + return svc.Spoof(), nil + } } - return spoof, nil + return spoofs.Spoof{}, fmt.Errorf("%w: fqdn: %s", ErrSpoofWithFQDNNotFound, fqdn) } -func (r *Repository) ReadFQDN(fqdn string) (spoofs.Spoof, error) { - allSpoofs, err := r.storage.LoadAll() +func (r *SpoofRepo) ReadAll() ([]spoofs.Spoof, error) { + 
allServices, err := r.store.LoadAll() if err != nil { - return spoofs.Spoof{}, fmt.Errorf("unable to read all spoofs: %w", err) + return nil, fmt.Errorf("failed to read from storage: %w", err) } - for _, spoof := range allSpoofs { - if spoof.FQDN == fqdn { - return spoof, nil - } + spoofs := make([]spoofs.Spoof, 0) + for _, svc := range allServices { + spoofs = append(spoofs, svc.Spoof()) } - return spoofs.Spoof{}, fmt.Errorf("%w: fqdn: %s", ErrSpoofWithFQDNNotFound, fqdn) -} - -func (r *Repository) ReadAll() ([]spoofs.Spoof, error) { - return r.storage.LoadAll() + return spoofs, nil } -func (r *Repository) HasOverride(fqdn string) (bool, error) { +func (r *SpoofRepo) HasOverride(fqdn string) (bool, error) { spoof, err := r.ReadFQDN(fqdn) if err != nil { if errors.Is(err, ErrSpoofWithFQDNNotFound) { diff --git a/internal/service/service.go b/internal/service/service.go index 02105d2..d57b4d4 100644 --- a/internal/service/service.go +++ b/internal/service/service.go @@ -225,6 +225,10 @@ func (s *Service) GetID() string { return s.id } +func (s *Service) GetFailureCount() int { + return s.failureCount +} + func (s *Service) ConfigChanged(other *Service) bool { if s.Fqdn != other.Fqdn || s.addr != other.addr || diff --git a/pkg/models/spoofs/override.go b/pkg/models/spoofs/override.go index 1e5f6ea..5026126 100644 --- a/pkg/models/spoofs/override.go +++ b/pkg/models/spoofs/override.go @@ -3,6 +3,6 @@ package spoofs import "net" type Override struct { - FQDN string `json:"fqdn"` + MemberOf string `json:"memberOf"` IP net.IP `json:"ip,omitempty"` } diff --git a/pkg/persistence/persistence.go b/pkg/persistence/persistence.go index 7e9f2e0..4b0358f 100644 --- a/pkg/persistence/persistence.go +++ b/pkg/persistence/persistence.go @@ -14,4 +14,5 @@ type Store[T any] interface { Load(key string) (T, error) LoadAll() ([]T, error) Delete(key string) error + Close() error } diff --git a/pkg/persistence/store/file/file.go b/pkg/persistence/store/file/file.go index 
e22c944..c608aed 100644 --- a/pkg/persistence/store/file/file.go +++ b/pkg/persistence/store/file/file.go @@ -5,10 +5,13 @@ import ( "fmt" "os" "sync" + + "github.com/vitistack/gslb-operator/pkg/persistence/store/memory" ) type Store[T any] struct { - lock sync.RWMutex + lock sync.RWMutex + cache *memory.Store[T] fileName string } @@ -20,8 +23,9 @@ func NewStore[T any](fileName string) (*Store[T], error) { store.Close() return &Store[T]{ - lock: sync.RWMutex{}, + lock: sync.RWMutex{}, fileName: fileName, + cache: memory.NewStore[T](), }, nil } @@ -130,3 +134,7 @@ func (s *Store[T]) Delete(key string) error { return nil } + +func (s *Store[T]) Close() error { + return nil +} diff --git a/pkg/persistence/store/memory/memory.go b/pkg/persistence/store/memory/memory.go index 318644b..c6b13b3 100644 --- a/pkg/persistence/store/memory/memory.go +++ b/pkg/persistence/store/memory/memory.go @@ -53,3 +53,7 @@ func (s *Store[T]) Delete(key string) error { delete(s.data, key) return nil } + +func (s *Store[T]) Close() error { + return nil +} From 5dfa651b1745a3d869017b6246edc921eece4b38 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Thu, 12 Feb 2026 14:27:51 +0100 Subject: [PATCH 02/24] chore: formatting --- internal/api/handlers/auth/auth.go | 3 +-- internal/api/handlers/spoofs/service.go | 2 +- internal/checks/checker.go | 1 - internal/dns/const.go | 2 +- internal/manager/manager_test.go | 6 +++--- internal/manager/query_manager.go | 2 +- internal/manager/servicegroup_test.go | 19 +++++++++---------- internal/service/service_test.go | 8 ++++---- pkg/auth/auth.go | 2 +- pkg/dnsdist/client_test.go | 2 +- pkg/loaders/flag_loader.go | 5 ++--- pkg/lua/bucket.go | 2 +- pkg/models/spoofs/hash.go | 2 +- pkg/models/spoofs/override.go | 2 +- pkg/persistence/store/memory/memory.go | 2 +- pkg/pool/bufferedQueue.go | 3 +-- pkg/pool/pool.go | 2 +- pkg/rest/const.go | 2 +- pkg/rest/request/client/client.go | 1 - pkg/rest/request/request.go | 3 +-- pkg/rest/response/error.go | 2 +- 21 
files changed, 33 insertions(+), 40 deletions(-) diff --git a/internal/api/handlers/auth/auth.go b/internal/api/handlers/auth/auth.go index 60632f7..9582994 100644 --- a/internal/api/handlers/auth/auth.go +++ b/internal/api/handlers/auth/auth.go @@ -1,5 +1,4 @@ package auth type AuthService struct { - -} \ No newline at end of file +} diff --git a/internal/api/handlers/spoofs/service.go b/internal/api/handlers/spoofs/service.go index 5921f8c..40bc412 100644 --- a/internal/api/handlers/spoofs/service.go +++ b/internal/api/handlers/spoofs/service.go @@ -17,7 +17,7 @@ type SpoofsService struct { func NewSpoofsService(store persistence.Store[model.Service], svcManager manager.QueryManager) *SpoofsService { return &SpoofsService{ svcRepo: service.NewServiceRepo(store), - spoofRepo: spoof.NewSpoofRepo(store), // create read-only + spoofRepo: spoof.NewSpoofRepo(store), // create read-only serviceManager: svcManager, } } diff --git a/internal/checks/checker.go b/internal/checks/checker.go index a47a174..dcd0925 100644 --- a/internal/checks/checker.go +++ b/internal/checks/checker.go @@ -3,4 +3,3 @@ package checks type Checker interface { Check() error } - diff --git a/internal/dns/const.go b/internal/dns/const.go index 48206d0..418a071 100644 --- a/internal/dns/const.go +++ b/internal/dns/const.go @@ -2,4 +2,4 @@ package dns import "time" -const DEFAULT_POLL_INTERVAL = time.Minute * 5 \ No newline at end of file +const DEFAULT_POLL_INTERVAL = time.Minute * 5 diff --git a/internal/manager/manager_test.go b/internal/manager/manager_test.go index b9e8e71..4e5befc 100644 --- a/internal/manager/manager_test.go +++ b/internal/manager/manager_test.go @@ -234,7 +234,7 @@ func TestServicesManager_moveServiceToInterval(t *testing.T) { // Named input parameters for target function. 
config model.GSLBConfig newInterval timesutil.Duration - shouldExist bool + shouldExist bool }{ { name: "change-to-non-existing-interval", @@ -245,7 +245,7 @@ func TestServicesManager_moveServiceToInterval(t *testing.T) { name: "change-to-existing-interval", config: genericGSLBConfig, newInterval: timesutil.FromDuration(time.Second), - shouldExist: true, + shouldExist: true, }, } for _, tt := range tests { @@ -257,7 +257,7 @@ func TestServicesManager_moveServiceToInterval(t *testing.T) { sm.newScheduler(tt.newInterval) } sm.moveServiceToInterval(svc, tt.newInterval) - + _, interval, _ := sm.scheduledServices.Search(svc.GetID()) if interval != tt.newInterval { t.Errorf("expected new interval: %s but got: %s", tt.newInterval.String(), interval.String()) diff --git a/internal/manager/query_manager.go b/internal/manager/query_manager.go index 69845f6..7c11f65 100644 --- a/internal/manager/query_manager.go +++ b/internal/manager/query_manager.go @@ -9,7 +9,7 @@ import ( // without exposing all functionality type QueryManager interface { GetActiveForMemberOf(memberOf string) *service.Service - + //write operations Failover(fqdn string, failover failover.Failover) error } diff --git a/internal/manager/servicegroup_test.go b/internal/manager/servicegroup_test.go index ef65a34..fe0d9e6 100644 --- a/internal/manager/servicegroup_test.go +++ b/internal/manager/servicegroup_test.go @@ -22,7 +22,7 @@ var activeConfig = model.GSLBConfig{ Datacenter: "dc1", Interval: timesutil.Duration(5 * time.Second), Priority: 1, - CheckType: "TCP-FULL", + CheckType: "TCP-FULL", } var passiveConfig = model.GSLBConfig{ @@ -32,7 +32,7 @@ var passiveConfig = model.GSLBConfig{ Datacenter: "dc2", Interval: timesutil.Duration(5 * time.Second), Priority: 2, - CheckType: "TCP-FULL", + CheckType: "TCP-FULL", } var active *service.Service @@ -66,10 +66,10 @@ func TestServiceGroup_RegisterService(t *testing.T) { t.Errorf("Expected group mode: %v, but got: %v, after two services with different priorities 
registered", ActivePassive, group.mode) } /* - if group.active != 0 { - t.Errorf("Expected activeIndex: %v, but got: %v", 0, group.activeIndex) - } - */ + if group.active != 0 { + t.Errorf("Expected activeIndex: %v, but got: %v", 0, group.activeIndex) + } + */ } func TestServiceGroup_OnServiceHealthChange(t *testing.T) { @@ -140,7 +140,6 @@ func TestServiceGroup_OnServiceHealthChange(t *testing.T) { } makeServiceHealthy(active) - } func makeServiceHealthy(service *service.Service) { @@ -166,15 +165,15 @@ func TestServiceGroup_memberExists(t *testing.T) { { name: "exists", member: &service.Service{ - Fqdn: "test.example.com", + Fqdn: "test.example.com", Datacenter: "JK", }, want: true, }, { - name: "does-not-exist", + name: "does-not-exist", member: &service.Service{}, - want: false, + want: false, }, } for _, tt := range tests { diff --git a/internal/service/service_test.go b/internal/service/service_test.go index 1c1a9eb..23481f5 100644 --- a/internal/service/service_test.go +++ b/internal/service/service_test.go @@ -278,7 +278,7 @@ func TestService_GetBaseInterval(t *testing.T) { Datacenter: "Abels1", Interval: timesutil.FromDuration(time.Second * 5), Priority: 1, - CheckType: "TCP-FULL", + CheckType: "TCP-FULL", }, dryRun: true, want: timesutil.FromDuration(time.Second * 5), @@ -292,7 +292,7 @@ func TestService_GetBaseInterval(t *testing.T) { Datacenter: "Abels1", Interval: timesutil.FromDuration(time.Second * 5), Priority: 2, - CheckType: "TCP-FULL", + CheckType: "TCP-FULL", }, dryRun: true, want: timesutil.FromDuration(time.Second * 5), @@ -306,7 +306,7 @@ func TestService_GetBaseInterval(t *testing.T) { Datacenter: "Abels1", Interval: timesutil.FromDuration(time.Second * 5), Priority: 3, - CheckType: "TCP-FULL", + CheckType: "TCP-FULL", }, dryRun: true, want: timesutil.FromDuration(time.Second * 5), @@ -320,7 +320,7 @@ func TestService_GetBaseInterval(t *testing.T) { Datacenter: "Abels1", Interval: timesutil.FromDuration(time.Second * 5), Priority: 4, - 
CheckType: "TCP-FULL", + CheckType: "TCP-FULL", }, dryRun: true, want: timesutil.FromDuration(time.Second * 5), diff --git a/pkg/auth/auth.go b/pkg/auth/auth.go index 2e71e6e..999c780 100644 --- a/pkg/auth/auth.go +++ b/pkg/auth/auth.go @@ -16,7 +16,7 @@ func WithTokenValidation(logger *slog.Logger) middleware.MiddlewareFunc { return func(w http.ResponseWriter, r *http.Request) { ctx := context.WithValue(r.Context(), "request_method", r.Method) ctx = context.WithValue(ctx, "request_route", r.URL.String()) - + resp, err := jwt.Validate(ctx, strings.Split(r.Header.Get("Authorization"), "Bearer")[1]) if err != nil { logger.Error("token-validation failed", slog.String("reason", err.Error())) diff --git a/pkg/dnsdist/client_test.go b/pkg/dnsdist/client_test.go index 95b2fa8..0c632bc 100644 --- a/pkg/dnsdist/client_test.go +++ b/pkg/dnsdist/client_test.go @@ -13,7 +13,7 @@ func TestNewClient(t *testing.T) { if err != nil { t.Errorf("could not create client: %v", err.Error()) } - + } func TestCommand(t *testing.T) { diff --git a/pkg/loaders/flag_loader.go b/pkg/loaders/flag_loader.go index fb1f459..cd2e0d2 100644 --- a/pkg/loaders/flag_loader.go +++ b/pkg/loaders/flag_loader.go @@ -1,7 +1,6 @@ package loaders - -type FlagLoader struct {} +type FlagLoader struct{} func NewFlagLoader() *FileLoader { return &FileLoader{} @@ -10,4 +9,4 @@ func NewFlagLoader() *FileLoader { func (f *FlagLoader) Load(dest any) error { return nil -} \ No newline at end of file +} diff --git a/pkg/lua/bucket.go b/pkg/lua/bucket.go index 0326e66..03ca911 100644 --- a/pkg/lua/bucket.go +++ b/pkg/lua/bucket.go @@ -28,7 +28,7 @@ func (pl *LuaBucket) get() *glua.LState { func (pl *LuaBucket) new() *glua.LState { L := glua.NewState(glua.Options{ - SkipOpenLibs: true, + SkipOpenLibs: true, IncludeGoStackTrace: true, MinimizeStackMemory: true, }) diff --git a/pkg/models/spoofs/hash.go b/pkg/models/spoofs/hash.go index 2840ce3..d1ae610 100644 --- a/pkg/models/spoofs/hash.go +++ b/pkg/models/spoofs/hash.go 
@@ -2,4 +2,4 @@ package spoofs type Hash struct { Hash string `json:"hash"` -} \ No newline at end of file +} diff --git a/pkg/models/spoofs/override.go b/pkg/models/spoofs/override.go index 5026126..6b5af86 100644 --- a/pkg/models/spoofs/override.go +++ b/pkg/models/spoofs/override.go @@ -4,5 +4,5 @@ import "net" type Override struct { MemberOf string `json:"memberOf"` - IP net.IP `json:"ip,omitempty"` + IP net.IP `json:"ip,omitempty"` } diff --git a/pkg/persistence/store/memory/memory.go b/pkg/persistence/store/memory/memory.go index c6b13b3..21af799 100644 --- a/pkg/persistence/store/memory/memory.go +++ b/pkg/persistence/store/memory/memory.go @@ -43,7 +43,7 @@ func (s *Store[T]) LoadAll() ([]T, error) { for _, val := range s.data { result = append(result, val) } - + return result, nil } diff --git a/pkg/pool/bufferedQueue.go b/pkg/pool/bufferedQueue.go index 5e7d267..62df50f 100644 --- a/pkg/pool/bufferedQueue.go +++ b/pkg/pool/bufferedQueue.go @@ -1,9 +1,8 @@ package pool - type BufferedJobQueue chan Job // returns true wether a new item will block or not func (bq *BufferedJobQueue) Blocked() bool { return len(*bq) == cap(*bq) -} \ No newline at end of file +} diff --git a/pkg/pool/pool.go b/pkg/pool/pool.go index b158672..fea83e0 100644 --- a/pkg/pool/pool.go +++ b/pkg/pool/pool.go @@ -124,7 +124,7 @@ func (wp *WorkerPool) worker(id uint32) { err := job.Execute() if err != nil { job.OnFailure(err) - }else { + } else { job.OnSuccess() } diff --git a/pkg/rest/const.go b/pkg/rest/const.go index b2514ec..d8265f2 100644 --- a/pkg/rest/const.go +++ b/pkg/rest/const.go @@ -2,4 +2,4 @@ package rest const ( ContentTypeJSON = "application/json" -) \ No newline at end of file +) diff --git a/pkg/rest/request/client/client.go b/pkg/rest/request/client/client.go index 2eca7c4..e34f8ce 100644 --- a/pkg/rest/request/client/client.go +++ b/pkg/rest/request/client/client.go @@ -41,7 +41,6 @@ func NewClient(timeout time.Duration, opts ...clientOption) (*HTTPClient, error) 
return &ctx.wrapped, nil } - func (c *Client) Do(req *http.Request) (*http.Response, error) { return c.Client.Do(req) } diff --git a/pkg/rest/request/request.go b/pkg/rest/request/request.go index 77c9b53..b909821 100644 --- a/pkg/rest/request/request.go +++ b/pkg/rest/request/request.go @@ -5,7 +5,6 @@ import ( "io" ) - func JSONDECODE[T any](body io.Reader, dest *T) error { return json.NewDecoder(body).Decode(dest) -} \ No newline at end of file +} diff --git a/pkg/rest/response/error.go b/pkg/rest/response/error.go index c6a024d..2f7f98e 100644 --- a/pkg/rest/response/error.go +++ b/pkg/rest/response/error.go @@ -29,7 +29,7 @@ var ( Title: string(ErrInternalError), }, ErrNotFound: { - Code: http.StatusNotFound, + Code: http.StatusNotFound, Title: string(ErrNotFound), }, } From 9cf45968e6193fc2bdf820ad3e9be292db5df9d6 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Mon, 16 Feb 2026 09:26:00 +0100 Subject: [PATCH 03/24] feat: storing more service information, need to fix override API endpoint --- cmd/main.go | 18 +-- internal/api/handlers/spoofs/overrides.go | 91 +++++------- internal/api/handlers/spoofs/service.go | 2 +- internal/api/routes/const.go | 14 +- internal/dns/updater.go | 74 ++-------- internal/manager/manager.go | 144 +++++++++++++++--- internal/manager/options.go | 9 ++ internal/manager/servicegroup.go | 5 +- internal/model/service.go | 15 +- internal/repositories/service/service.go | 171 +++++++++++++++++++--- internal/repositories/spoof/spoof.go | 48 +++--- internal/service/service.go | 43 +++--- pkg/persistence/store/file/file.go | 39 +---- pkg/persistence/store/memory/memory.go | 3 +- 14 files changed, 412 insertions(+), 264 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index f6e61c0..930149a 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -34,22 +34,21 @@ func main() { bslog.Fatal("could not load lua configuration", slog.Any("reason", err)) } + serviceFileStore, err := file.NewStore[model.GSLBServiceGroup]("store.json") + if err != nil { + 
bslog.Fatal("could not create persistent storage", slog.String("reason", err.Error())) + } + svcRepo := service.NewServiceRepo(serviceFileStore) + // creating dns - handler objects zoneFetcher := dns.NewZoneFetcherWithAutoPoll() mgr := manager.NewManager( manager.WithMinRunningWorkers(100), manager.WithNonBlockingBufferSize(110), + manager.WithServiceRepository(svcRepo), ) - serviceFileStore, err := file.NewStore[model.Service]("store.json") - if err != nil { - bslog.Fatal("could not create persistent storage", slog.String("reason", err.Error())) - } - - svcRepo := service.NewServiceRepo(serviceFileStore) - updater, err := dns.NewUpdater( - dns.UpdaterWithSpoofRepo(svcRepo), - ) + updater, err := dns.NewUpdater() if err != nil { bslog.Fatal("unable to create updater", slog.String("error", err.Error())) } @@ -71,7 +70,6 @@ func main() { // initializing the service jwt self signer jwt.InitServiceTokenManager(cfg.JWT().Secret(), cfg.JWT().User()) - fmt.Println(jwt.GetInstance().GetServiceToken()) api.HandleFunc(routes.POST_FAILOVER, middleware.Chain( middleware.WithIncomingRequestLogging(slog.Default()), diff --git a/internal/api/handlers/spoofs/overrides.go b/internal/api/handlers/spoofs/overrides.go index 9140f9c..799234a 100644 --- a/internal/api/handlers/spoofs/overrides.go +++ b/internal/api/handlers/spoofs/overrides.go @@ -13,6 +13,8 @@ import ( "log/slog" "net/http" + "github.com/vitistack/gslb-operator/internal/api/routes" + "github.com/vitistack/gslb-operator/internal/model" spoofRepo "github.com/vitistack/gslb-operator/internal/repositories/spoof" "github.com/vitistack/gslb-operator/pkg/bslog" "github.com/vitistack/gslb-operator/pkg/models/spoofs" @@ -22,23 +24,23 @@ import ( func (ss *SpoofsService) GetOverride(w http.ResponseWriter, r *http.Request) { logger := bslog.With(slog.Any("request_id", r.Context().Value("id"))) - memberOf := r.PathValue("memberOf") + memberOf := r.PathValue(routes.MemberOf) if memberOf == "" { - logger.Error("skipping request due 
to insufficient input parameters", slog.String("reason", "missing fqdn")) - response.Err(w, response.ErrInvalidInput, "missing fqdn") + logger.Error("skipping request due to insufficient input parameters", slog.String("reason", "missing member-of")) + response.Err(w, response.ErrInvalidInput, "missing member-of") return } - exist, err := ss.spoofRepo.ReadFQDN(memberOf) + exist, err := ss.svcRepo.GetActive(memberOf) if err != nil { logger.Error("could not read spoofs", slog.String("reason", err.Error())) response.Err(w, response.ErrInternalError, "") return } - if exist.DC != "OVERRIDE" { - logger.Error("service does not have an active override", slog.String("fqdn", exist.FQDN)) + if !exist.HasOverride { + logger.Error("service does not have an active override", slog.String("memberOf", exist.MemberOf)) response.Err(w, response.ErrNotFound, "not an active override") return } @@ -63,8 +65,8 @@ func (ss *SpoofsService) CreateOverride(w http.ResponseWriter, r *http.Request) err = ss.newOverride(override) if err != nil { logger.Error("could not override spoof", slog.String("reason", err.Error())) - if errors.Is(err, spoofRepo.ErrSpoofWithFQDNNotFound) { - response.Err(w, response.ErrNotFound, "fqdn not found: "+override.MemberOf) + if errors.Is(err, spoofRepo.ErrSpoofInServiceGroupNotFound) { + response.Err(w, response.ErrNotFound, "group: "+override.MemberOf) return } @@ -89,8 +91,8 @@ func (ss *SpoofsService) UpdateOverride(w http.ResponseWriter, r *http.Request) err = ss.updateOverride(override) if err != nil { logger.Error("could not update spoof", slog.String("reason", err.Error())) - if errors.Is(err, spoofRepo.ErrSpoofWithFQDNNotFound) { - response.Err(w, response.ErrNotFound, "member-of not found: "+override.MemberOf) + if errors.Is(err, spoofRepo.ErrSpoofInServiceGroupNotFound) { + response.Err(w, response.ErrNotFound, "group: "+override.MemberOf) return } @@ -123,98 +125,75 @@ func (ss *SpoofsService) DeleteOverride(w http.ResponseWriter, r *http.Request) } 
func (ss *SpoofsService) newOverride(override spoofs.Override) error { - exist, err := ss.svcRepo.FetchServiceMemberOf(override.MemberOf) + exist, err := ss.svcRepo.GetActive(override.MemberOf) if err != nil { - return fmt.Errorf("unable to read spoofs from storage: %w", err) + return fmt.Errorf("unable to get active service for group: %s: %w", override.MemberOf, err) } - if exist.Datacenter == "OVERRIDE" { + if exist.HasOverride { return fmt.Errorf("service already has active override: %s", exist.MemberOf) } - err = ss.svcRepo.Delete(exist.Key()) - if err != nil { - return fmt.Errorf("could not delete old spoof: %w", err) - } - - exist.Datacenter = "OVERRIDE" exist.IP = override.IP.String() + exist.HasOverride = true - err = ss.svcRepo.Create(&exist) + err = ss.svcRepo.Update(&exist) if err != nil { - return fmt.Errorf("could not create spoof: %w", err) + return fmt.Errorf("failed to update GSLB service with override flag: %w", err) } return nil } func (ss *SpoofsService) updateOverride(override spoofs.Override) error { - exist, err := ss.svcRepo.FetchServiceMemberOf(override.MemberOf) + active, err := ss.svcRepo.GetActive(override.MemberOf) if err != nil { - return fmt.Errorf("unable to read spoofs from storage: %w", err) + return fmt.Errorf("unable to get active service for group: %s: %w", override.MemberOf, err) } - if exist.Datacenter != "OVERRIDE" { - return fmt.Errorf("%s does not have an active override", override.MemberOf) + if active.HasOverride { + return fmt.Errorf("service already has active override: %s", active.MemberOf) } - exist.IP = override.IP.String() + active.IP = override.IP.String() - err = ss.svcRepo.Update(exist.Key(), &exist) + err = ss.svcRepo.UpdateOverride(override.IP.String(), &active) if err != nil { - return fmt.Errorf("could not update spoof: %w", err) + return fmt.Errorf("failed to update GSLB service with override flag: %w", err) } return nil } func (ss *SpoofsService) deleteOverride(override spoofs.Override) error { - exist, err 
:= ss.svcRepo.FetchServiceMemberOf(override.MemberOf) + exist, err := ss.svcRepo.GetActive(override.MemberOf) if err != nil { - return fmt.Errorf("unable to read spoofs from storage: %w", err) + return fmt.Errorf("unable to get active service for group: %s: %w", override.MemberOf, err) } - if exist.Datacenter != "OVERRIDE" { + if !exist.HasOverride { return fmt.Errorf("%s does not have an override currently set", override.MemberOf) } - spoof := ss.restoreSpoof(override) - - err = ss.svcRepo.Delete(exist.Key()) + err = ss.svcRepo.RemoveOverrideFlag(override.MemberOf) if err != nil { - return fmt.Errorf("could not update spoof: %w", err) - } - - if spoof == nil { // if not possible to create new spoof, we return with NO spoof for the fqdn - return nil + return fmt.Errorf("failed to remove override flag: %w", err) } - exist.Datacenter = spoof.DC - exist.Fqdn = spoof.FQDN - exist.IP = spoof.IP - - err = ss.svcRepo.Create(&exist) + active := ss.restoreActive(override) + err = ss.svcRepo.Update(active) if err != nil { - return fmt.Errorf("could not create spoof for active service: %w", err) + return fmt.Errorf("could not restore active service in group after override flag has been removed: %w", err) } return nil } -func (ss *SpoofsService) restoreSpoof(override spoofs.Override) *spoofs.Spoof { +func (ss *SpoofsService) restoreActive(override spoofs.Override) *model.GSLBService { svc := ss.serviceManager.GetActiveForMemberOf(override.MemberOf) if svc == nil { // no active service: e.g. 
no spoof should be there return nil } - ip, err := svc.GetIP() - if err != nil { - return nil - } - - return &spoofs.Spoof{ - FQDN: svc.Fqdn, - DC: svc.Datacenter, - IP: ip, - } + return svc.GSLBService() } diff --git a/internal/api/handlers/spoofs/service.go b/internal/api/handlers/spoofs/service.go index 40bc412..69a31ad 100644 --- a/internal/api/handlers/spoofs/service.go +++ b/internal/api/handlers/spoofs/service.go @@ -14,7 +14,7 @@ type SpoofsService struct { serviceManager manager.QueryManager } -func NewSpoofsService(store persistence.Store[model.Service], svcManager manager.QueryManager) *SpoofsService { +func NewSpoofsService(store persistence.Store[model.GSLBServiceGroup], svcManager manager.QueryManager) *SpoofsService { return &SpoofsService{ svcRepo: service.NewServiceRepo(store), spoofRepo: spoof.NewSpoofRepo(store), // create read-only diff --git a/internal/api/routes/const.go b/internal/api/routes/const.go index 177a702..e58bf43 100644 --- a/internal/api/routes/const.go +++ b/internal/api/routes/const.go @@ -1,6 +1,8 @@ package routes -import "net/http" +import ( + "net/http" +) const ( ROOT = "/" @@ -13,9 +15,9 @@ const ( GET_SPOOFS_HASH = http.MethodGet + " " + SPOOFS_HASH // Route to hash all spoofs, for config validation POST_SPOOF = http.MethodPost + " " + SPOOFS // Route POST - OVERRIDE = SPOOFS + "/override" // override DNSDIST configuration - GET_OVERRIDE = http.MethodGet + " " + OVERRIDE + "/{fqdn}" // Route GET - POST_OVERRIDE = http.MethodPost + " " + OVERRIDE // Route POST + OVERRIDE = SPOOFS + "/override" // override DNSDIST configuration + GET_OVERRIDE = http.MethodGet + " " + OVERRIDE + "/{" + MemberOf + "}" // Route GET + POST_OVERRIDE = http.MethodPost + " " + OVERRIDE // Route POST PUT_OVERRIDE = http.MethodPut + " " + OVERRIDE + "/{fqdn}" DELETE_OVERRIDE = http.MethodDelete + " " + OVERRIDE // Route DELETE @@ -26,3 +28,7 @@ const ( AUTH_LOGIN = AUTH + "/login" POST_AUTH_LOGIN = http.MethodPost + " " + AUTH_LOGIN ) + +const ( + 
MemberOf = "memberOf" +) diff --git a/internal/dns/updater.go b/internal/dns/updater.go index 584a50f..8b87f32 100644 --- a/internal/dns/updater.go +++ b/internal/dns/updater.go @@ -7,12 +7,9 @@ import ( "time" "github.com/vitistack/gslb-operator/internal/config" - "github.com/vitistack/gslb-operator/internal/model" - svcRepo "github.com/vitistack/gslb-operator/internal/repositories/service" "github.com/vitistack/gslb-operator/internal/service" "github.com/vitistack/gslb-operator/pkg/auth/jwt" - "github.com/vitistack/gslb-operator/pkg/bslog" - "github.com/vitistack/gslb-operator/pkg/persistence/store/memory" + "github.com/vitistack/gslb-operator/pkg/models/spoofs" "github.com/vitistack/gslb-operator/pkg/rest/request" "github.com/vitistack/gslb-operator/pkg/rest/request/client" ) @@ -21,7 +18,6 @@ type updaterOption func(u *Updater) type Updater struct { Server string - svcRepo *svcRepo.ServiceRepo client client.HTTPClient builder *request.Builder mu *sync.Mutex @@ -39,10 +35,9 @@ func NewUpdater(opts ...updaterOption) (*Updater, error) { } u := &Updater{ - Server: config.GetInstance().GSLB().UpdaterHost(), - svcRepo: svcRepo.NewServiceRepo(memory.NewStore[model.Service]()), - client: *c, - mu: &sync.Mutex{}, + Server: config.GetInstance().GSLB().UpdaterHost(), + client: *c, + mu: &sync.Mutex{}, } for _, opt := range opts { @@ -53,12 +48,6 @@ func NewUpdater(opts ...updaterOption) (*Updater, error) { return u, nil } -func UpdaterWithSpoofRepo(svcRepo *svcRepo.ServiceRepo) updaterOption { - return func(u *Updater) { - u.svcRepo = svcRepo - } -} - func UpdaterWithServer(server string) updaterOption { return func(u *Updater) { u.Server = server @@ -72,23 +61,6 @@ func UpdaterWithClient(client *client.HTTPClient) updaterOption { } func (u *Updater) ServiceDown(svc *service.Service) error { - u.mu.Lock() - override, err := u.svcRepo.HasOverride(svc.MemberOf) - if err != nil { - return fmt.Errorf("unable to delete spoof: %w", err) - } - - if override { - 
bslog.Debug("service has spoof active override", slog.Any("service", svc)) - return nil - } - - err = u.svcRepo.Delete(fmt.Sprintf("%s:%s", svc.MemberOf, svc.Datacenter)) - u.mu.Unlock() - if err != nil { - return fmt.Errorf("unable to delete service from storage: %s", err.Error()) - } - token, err := jwt.GetInstance().GetServiceToken() if err != nil { return fmt.Errorf("could not fetch service token: %w", err) @@ -115,38 +87,6 @@ func (u *Updater) ServiceDown(svc *service.Service) error { } func (u *Updater) ServiceUp(svc *service.Service) error { - u.mu.Lock() - override, err := u.svcRepo.HasOverride(svc.MemberOf) - if err != nil { - return fmt.Errorf("unable to store spoof: %w", err) - } - - if override { - bslog.Debug("service has spoof active override", slog.Any("service", svc)) - return nil - } - - ip, err := svc.GetIP() - if err != nil { - return fmt.Errorf("unable to get ip address: %s", err.Error()) - } - - spoof := &model.Service{ - ID: svc.GetID(), - MemberOf: svc.MemberOf, - Fqdn: svc.Fqdn, - IP: ip, - Datacenter: svc.Datacenter, - IsHealthy: svc.IsHealthy(), - FailureCount: svc.GetFailureCount(), - } - - err = u.svcRepo.Create(spoof) - u.mu.Unlock() - if err != nil { - return fmt.Errorf("could not store new spoof: %s", err.Error()) - } - token, err := jwt.GetInstance().GetServiceToken() if err != nil { return fmt.Errorf("could not fetch service token: %w", err) @@ -154,7 +94,11 @@ func (u *Updater) ServiceUp(svc *service.Service) error { req, err := u.builder.POST().SetHeader("Authorization", token). URL("/spoofs"). - Body(spoof). + Body(spoofs.Spoof{ + FQDN: svc.MemberOf, + IP: svc.GetIP(), + DC: svc.Datacenter, + }). 
Build() if err != nil { return fmt.Errorf("could not create post request for update: %s", err.Error()) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 4fe3200..d395267 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -9,10 +9,12 @@ import ( "github.com/vitistack/gslb-operator/internal/manager/scheduler" "github.com/vitistack/gslb-operator/internal/model" + svcRepo "github.com/vitistack/gslb-operator/internal/repositories/service" "github.com/vitistack/gslb-operator/internal/service" "github.com/vitistack/gslb-operator/internal/utils/timesutil" "github.com/vitistack/gslb-operator/pkg/bslog" "github.com/vitistack/gslb-operator/pkg/models/failover" + "github.com/vitistack/gslb-operator/pkg/persistence/store/memory" "github.com/vitistack/gslb-operator/pkg/pool" ) @@ -22,6 +24,7 @@ type ServicesManager struct { scheduledServices ScheduledServices // services that are scheduled on an interval schedulers map[timesutil.Duration]*scheduler.Scheduler // schedulers for health-checks serviceGroups map[string]*ServiceGroup + svcRepo *svcRepo.ServiceRepo mutex sync.RWMutex stop sync.Once pool pool.WorkerPool @@ -35,6 +38,7 @@ func NewManager(opts ...serviceManagerOption) *ServicesManager { MinRunningWorkers: 100, NonBlockingBufferSize: 110, DryRun: false, + repo: svcRepo.NewServiceRepo(memory.NewStore[model.GSLBServiceGroup]()), } for _, opt := range opts { @@ -49,6 +53,7 @@ func NewManager(opts ...serviceManagerOption) *ServicesManager { scheduledServices: make(ScheduledServices), schedulers: make(map[timesutil.Duration]*scheduler.Scheduler), serviceGroups: make(map[string]*ServiceGroup), + svcRepo: cfg.repo, mutex: sync.RWMutex{}, pool: *pool.NewWorkerPool(cfg.MinRunningWorkers, cfg.NonBlockingBufferSize), stop: sync.Once{}, @@ -76,6 +81,11 @@ func (sm *ServicesManager) Stop() { }) } +func (sm *ServicesManager) OnShutdown() error { + + return nil +} + func (sm *ServicesManager) RegisterService(serviceCfg 
model.GSLBConfig) (*service.Service, error) { newService, err := service.NewServiceFromGSLBConfig(serviceCfg, sm.dryrun) // create the service object if err != nil { @@ -94,9 +104,22 @@ func (sm *ServicesManager) RegisterService(serviceCfg model.GSLBConfig) (*servic sm.mutex.Lock() defer sm.mutex.Unlock() + err = sm.svcRepo.Create(newService.GSLBService()) + if err != nil { + return nil, fmt.Errorf("failed to create new service: %w", err) + } + // set healthchange callback action newService.SetHealthChangeCallback(func(healthy bool) { bslog.Debug("received health-change", slog.Any("service", newService), slog.Bool("healthy", healthy)) + err := sm.svcRepo.Update(newService.GSLBService()) + if err != nil { + bslog.Error( + "failed to update service health on health-change", + slog.String("reason", err.Error()), + slog.Any("service", newService), + ) + } sm.serviceGroups[newService.MemberOf].OnServiceHealthChange(newService, healthy) }) @@ -141,9 +164,14 @@ func (sm *ServicesManager) RemoveService(id string) error { sm.mutex.Lock() defer sm.mutex.Unlock() + sm.scheduledServices.Delete(id) - bslog.Debug("removed service", slog.Any("service", svc)) + err := sm.svcRepo.Delete(svc.MemberOf, svc.GetID()) + if err != nil { + return fmt.Errorf("failed to delete service: %w", err) + } + bslog.Debug("removed service", slog.Any("service", svc)) return nil } @@ -172,29 +200,7 @@ func (sm *ServicesManager) updateService(old, new *service.Service) { sm.mutex.Unlock() if oldMemberOf != newMemberOf { - sm.mutex.Lock() - newGroup, newOk := sm.serviceGroups[newMemberOf] - if !newOk { - newGroup = sm.newServiceGroup(newMemberOf) - } - - oldGroup, oldOk := sm.serviceGroups[oldMemberOf] - sm.mutex.Unlock() - - newGroup.RegisterService(old) - var empty bool - if oldOk { - empty = oldGroup.RemoveService(old.GetID()) - - } - if empty { // delete empty service group - delete(sm.serviceGroups, oldMemberOf) - } - bslog.Debug( - "updated service group membership", - slog.String("oldGroup", 
oldMemberOf), - slog.String("newGroup", newMemberOf), - ) + sm.memberOfChanged(oldMemberOf, newMemberOf, old) } else { sm.mutex.RLock() oldGroup, ok := sm.serviceGroups[oldMemberOf] @@ -211,6 +217,15 @@ func (sm *ServicesManager) updateService(old, new *service.Service) { sm.mutex.Lock() defer sm.mutex.Unlock() + err := sm.svcRepo.Update(old.GSLBService()) + if err != nil { + bslog.Error( + "failed to update service config persistently", + slog.String("reason", err.Error()), + slog.Any("service", old), + ) + } + // important that this checked AFTER the service groups have ran their update // this is because the group may trigger a promotion event that needs to be handled first // if the promotion event does not happen, we just simply move it to a new interval @@ -223,6 +238,56 @@ func (sm *ServicesManager) updateService(old, new *service.Service) { bslog.Debug("updated service", slog.Any("service", old)) } +func (sm *ServicesManager) memberOfChanged(oldMemberOf, newMemberOf string, svc *service.Service) { + sm.mutex.Lock() + + err := sm.svcRepo.Delete(oldMemberOf, svc.GetID()) + if err != nil { + bslog.Error( + "failed to remove service from old service group", + slog.String("reason", err.Error()), + slog.String("oldMemberOf", oldMemberOf), + slog.Any("service", svc), + ) + return + } + + err = sm.svcRepo.Create(svc.GSLBService()) + if err != nil { + bslog.Error( + "failed to add service to new group", + slog.String("reason", err.Error()), + slog.String("newMemberOf", newMemberOf), + slog.Any("service", svc), + ) + return + } + + newGroup, newOk := sm.serviceGroups[newMemberOf] + if !newOk { + newGroup = sm.newServiceGroup(newMemberOf) + } + + oldGroup, oldOk := sm.serviceGroups[oldMemberOf] + sm.mutex.Unlock() + + newGroup.RegisterService(svc) + + var empty bool + if oldOk { + empty = oldGroup.RemoveService(svc.GetID()) + + } + if empty { // delete empty service group + delete(sm.serviceGroups, oldMemberOf) + } + bslog.Debug( + "updated service group membership", 
+ slog.String("oldGroup", oldMemberOf), + slog.String("newGroup", newMemberOf), + ) +} + // re-schedules the relevant services in the PromotionEvent func (sm *ServicesManager) handlePromotion(event *PromotionEvent) { sm.mutex.Lock() @@ -263,6 +328,22 @@ func (sm *ServicesManager) handlePromotion(event *PromotionEvent) { if event.OldActive != nil && event.NewActive != nil { // just swap, and do dns updates demotedInterval = event.NewActive.ScheduledInterval + oldActiveGSLBService := event.OldActive.GSLBService() + oldActiveGSLBService.IsActive = false + err := sm.svcRepo.Update(oldActiveGSLBService) + if err != nil { + bslog.Error("failed to remove active flag from service", slog.Any("oldActive", event.OldActive)) + return + } + + newActiveGSLBService := event.NewActive.GSLBService() + newActiveGSLBService.IsActive = true + err = sm.svcRepo.Update(newActiveGSLBService) + if err != nil { + bslog.Error("failed to update active flag on service", slog.Any("newActive", event.NewActive)) + return + } + bslog.Warn("demoting service", slog.Any("oldActive", event.OldActive), slog.Group("intervalChange", @@ -270,6 +351,7 @@ func (sm *ServicesManager) handlePromotion(event *PromotionEvent) { slog.String("to", demotedInterval.String()), )) sm.moveServiceToInterval(event.OldActive, demotedInterval) + sm.DNSUpdate(event.OldActive, false) bslog.Warn("promoting service", @@ -284,6 +366,13 @@ func (sm *ServicesManager) handlePromotion(event *PromotionEvent) { } if event.NewActive != nil { // first service to come up when all services are down + newActiveGSLBService := event.NewActive.GSLBService() + newActiveGSLBService.IsActive = true + err := sm.svcRepo.Update(newActiveGSLBService) + if err != nil { + bslog.Error("failed to update active flag on service", slog.Any("newActive", event.NewActive)) + return + } bslog.Info("new active service", slog.Any("service", event.NewActive)) sm.moveServiceToInterval(event.NewActive, baseInterval) sm.DNSUpdate(event.NewActive, true) @@ -291,6 
+380,13 @@ func (sm *ServicesManager) handlePromotion(event *PromotionEvent) { } if event.OldActive != nil { // no service to take over + oldActiveGSLBService := event.OldActive.GSLBService() + oldActiveGSLBService.IsActive = false + err := sm.svcRepo.Update(oldActiveGSLBService) + if err != nil { + bslog.Error("failed to remove active flag from service", slog.Any("oldActive", event.OldActive)) + return + } bslog.Warn("no available sites", slog.String("serviceGroup", event.Service)) sm.DNSUpdate(event.OldActive, false) return diff --git a/internal/manager/options.go b/internal/manager/options.go index 1c1c106..068d8b1 100644 --- a/internal/manager/options.go +++ b/internal/manager/options.go @@ -1,9 +1,12 @@ package manager +import "github.com/vitistack/gslb-operator/internal/repositories/service" + type managerConfig struct { MinRunningWorkers uint NonBlockingBufferSize uint DryRun bool + repo *service.ServiceRepo } type serviceManagerOption func(cfg *managerConfig) @@ -25,3 +28,9 @@ func WithDryRun(enabled bool) serviceManagerOption { cfg.DryRun = enabled } } + +func WithServiceRepository(repo *service.ServiceRepo) serviceManagerOption { + return func(cfg *managerConfig) { + cfg.repo = repo + } +} diff --git a/internal/manager/servicegroup.go b/internal/manager/servicegroup.go index 7d85a65..392ea4d 100644 --- a/internal/manager/servicegroup.go +++ b/internal/manager/servicegroup.go @@ -112,15 +112,16 @@ func (sg *ServiceGroup) firstHealthy() *service.Service { func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, healthy bool) { sg.mu.Lock() - defer sg.mu.Unlock() oldActive := sg.active if oldActive == nil { oldActive = sg.lastActive } + switch sg.mode { case ActivePassive: if !healthy && sg.active.GetID() == changedService.GetID() { // active has gone down! 
sg.lastActive = sg.active + sg.mu.Unlock() sg.OnPromotion(sg.promoteNextHealthy()) return } @@ -164,6 +165,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h // unhealthy if changedService.GetID() == sg.active.GetID() { + sg.mu.Unlock() next := sg.firstHealthy() if next != nil { sg.OnPromotion(&PromotionEvent{ @@ -186,6 +188,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h sg.active = nil } } + sg.mu.Unlock() } // This does not take in to account if the registered service has the highest priority diff --git a/internal/model/service.go b/internal/model/service.go index c1ed21d..ff2c996 100644 --- a/internal/model/service.go +++ b/internal/model/service.go @@ -4,9 +4,11 @@ import ( "github.com/vitistack/gslb-operator/pkg/models/spoofs" ) +type GSLBServiceGroup []GSLBService + // storage representation of service // services that are configured with gslb config end up as a service.Service -type Service struct { +type GSLBService struct { ID string `json:"id"` MemberOf string `json:"memberOf"` Fqdn string `json:"fqdn"` @@ -14,15 +16,18 @@ type Service struct { IP string `json:"ip"` IsHealthy bool `json:"isHealthy"` FailureCount int `json:"failureCount"` + IsActive bool `json:"isActive"` + HasOverride bool `json:"hasOverride"` } -func (s Service) Key() string { - return s.MemberOf + ":" + s.Datacenter +func (s GSLBService) Key() string { + return s.MemberOf } -func (s Service) Spoof() spoofs.Spoof { +// returns spoof representation of GSLBService +func (s GSLBService) Spoof() spoofs.Spoof { return spoofs.Spoof{ - FQDN: s.Fqdn, + FQDN: s.MemberOf, IP: s.IP, DC: s.Datacenter, } diff --git a/internal/repositories/service/service.go b/internal/repositories/service/service.go index c147b52..68f15b6 100644 --- a/internal/repositories/service/service.go +++ b/internal/repositories/service/service.go @@ -3,6 +3,7 @@ package service import ( "errors" "fmt" + "slices" 
"github.com/vitistack/gslb-operator/internal/model" "github.com/vitistack/gslb-operator/pkg/persistence" @@ -14,47 +15,173 @@ var ( // repository for services that are considered active in a service group type ServiceRepo struct { - store persistence.Store[model.Service] + store persistence.Store[model.GSLBServiceGroup] } -func NewServiceRepo(store persistence.Store[model.Service]) *ServiceRepo { +func NewServiceRepo(store persistence.Store[model.GSLBServiceGroup]) *ServiceRepo { return &ServiceRepo{ store: store, } } -func (sr *ServiceRepo) Create(new *model.Service) error { - err := sr.store.Save(new.Key(), *new) +func (sr *ServiceRepo) Create(new *model.GSLBService) error { + override, err := sr.HasOverride(new.MemberOf) + if err != nil { + return err + } + + if override { + return nil + } + + group, err := sr.Read(new.MemberOf) + if err != nil { + return fmt.Errorf("failed to check for existing service group: %w", err) + } + + if group == nil { + group = make(model.GSLBServiceGroup, 0) + group = append(group, *new) + err := sr.store.Save(new.MemberOf, group) + if err != nil { + return fmt.Errorf("failed to store service: %w", err) + } + return nil + } + + if slices.ContainsFunc( + group, + func(s model.GSLBService) bool { + return s.ID == new.ID + }) { + return fmt.Errorf("failed to store service: service already exists") + } + + group = append(group, *new) + err = sr.store.Save(new.Key(), group) if err != nil { return fmt.Errorf("failed to store service: %w", err) } + + return nil +} + +func (sr *ServiceRepo) Update(new *model.GSLBService) error { + override, err := sr.HasOverride(new.MemberOf) + if err != nil { + return err + } + + group, err := sr.Read(new.MemberOf) + if err != nil { + return fmt.Errorf("failed to check for existing service group: %w", err) + } + + if group == nil { + return fmt.Errorf("failed to update service: service group for: %s does not exist", new.MemberOf) + } + + for idx, svc := range group { + if svc.ID == new.ID { + if 
svc.IsActive { + if override { + new.IP = svc.IP // preserve ip on override, but only for active + new.HasOverride = true + } + } + group[idx] = *new + err = sr.store.Save(new.MemberOf, group) + + if err != nil { + return fmt.Errorf("failed to update entry with id: %s: %w", new.MemberOf, err) + } + break + } + } + return nil } -func (sr *ServiceRepo) Update(id string, new *model.Service) error { - err := sr.store.Save(id, *new) +func (sr *ServiceRepo) UpdateOverride(ip string, service *model.GSLBService) error { + service.IP = ip + + group, err := sr.Read(service.MemberOf) if err != nil { - return fmt.Errorf("failed to update entry with id: %s: %w", id, err) + return fmt.Errorf("failed to retrieve service group: %w", err) + } + + if group == nil { + return fmt.Errorf("failed to update service: service group for: %s does not exist", service.MemberOf) + } + + for idx, svc := range group { + if svc.ID == service.ID { + group[idx] = *service + err = sr.store.Save(service.MemberOf, group) + + if err != nil { + return fmt.Errorf("failed to update override: %w", err) + } + break + } } return nil } -func (sr *ServiceRepo) Delete(id string) error { - err := sr.store.Delete(id) +func (sr *ServiceRepo) RemoveOverrideFlag(memberOf string) error { + group, err := sr.Read(memberOf) + if err != nil { + return err + } + + for idx := range group { + group[idx].HasOverride = false // update flag for every service in group + } + + return sr.store.Save(memberOf, group) +} + +func (sr *ServiceRepo) Delete(memberOf string, id string) error { + group, err := sr.Read(memberOf) + if err != nil { + return err + } + + override, err := sr.HasOverride(memberOf) + if err != nil { + return err + } + + if override { + return nil + } + + group = slices.DeleteFunc(group, func(s model.GSLBService) bool { // delete service with id + return s.ID == id + }) + if len(group) == 0 { // delete service group if empty group + err = sr.store.Delete(memberOf) + if err != nil { + return fmt.Errorf("failed to 
delete service group after empty result: %w", err) + } + } + + err = sr.store.Save(memberOf, group) // save the remaining services if err != nil { return fmt.Errorf("failed to delete entry with id: %s: %w", id, err) } return nil } -func (sr *ServiceRepo) Read(id string) (model.Service, error) { - svc, err := sr.store.Load(id) +func (sr *ServiceRepo) Read(id string) (model.GSLBServiceGroup, error) { + group, err := sr.store.Load(id) if err != nil { - return model.Service{}, fmt.Errorf("failed to read from storage: %w", err) + return nil, fmt.Errorf("failed to read from storage: %w", err) } - return svc, nil + return group, nil } -func (sr *ServiceRepo) ReadAll() ([]model.Service, error) { + +func (sr *ServiceRepo) ReadAll() ([]model.GSLBServiceGroup, error) { services, err := sr.store.LoadAll() if err != nil { return nil, fmt.Errorf("failed to read from storage: %w", err) @@ -63,23 +190,23 @@ func (sr *ServiceRepo) ReadAll() ([]model.Service, error) { return services, nil } -func (sr *ServiceRepo) FetchServiceMemberOf(memberOf string) (model.Service, error) { - allServices, err := sr.ReadAll() +func (sr *ServiceRepo) GetActive(memberOf string) (model.GSLBService, error) { + group, err := sr.Read(memberOf) if err != nil { - return model.Service{}, err + return model.GSLBService{}, err } - for _, svc := range allServices { - if svc.MemberOf == memberOf { + for _, svc := range group { + if svc.IsActive { return svc, nil } } - return model.Service{}, fmt.Errorf("%w: member-of %s", ErrServiceWithMemberOfNotFound, memberOf) + return model.GSLBService{}, fmt.Errorf("%w: member-of %s", ErrServiceWithMemberOfNotFound, memberOf) } func (sr *ServiceRepo) HasOverride(memberOf string) (bool, error) { - svc, err := sr.FetchServiceMemberOf(memberOf) + svc, err := sr.GetActive(memberOf) if err != nil { if errors.Is(err, ErrServiceWithMemberOfNotFound) { return false, nil @@ -87,5 +214,5 @@ func (sr *ServiceRepo) HasOverride(memberOf string) (bool, error) { return false, err } - 
return svc.Datacenter == "OVERRIDE", nil + return svc.HasOverride, nil } diff --git a/internal/repositories/spoof/spoof.go b/internal/repositories/spoof/spoof.go index 5fa8d76..c8cd1cc 100644 --- a/internal/repositories/spoof/spoof.go +++ b/internal/repositories/spoof/spoof.go @@ -10,66 +10,64 @@ import ( ) var ( - ErrSpoofWithFQDNNotFound = errors.New("spoof with fqdn not found") + ErrSpoofInServiceGroupNotFound = errors.New("spoof in service group not found") ) // read-only repo for spoofs type SpoofRepo struct { - store persistence.Store[model.Service] + store persistence.Store[model.GSLBServiceGroup] } -func NewSpoofRepo(storage persistence.Store[model.Service]) *SpoofRepo { +func NewSpoofRepo(storage persistence.Store[model.GSLBServiceGroup]) *SpoofRepo { return &SpoofRepo{ store: storage, } } func (r *SpoofRepo) Read(id string) (spoofs.Spoof, error) { - svc, err := r.store.Load(id) + group, err := r.store.Load(id) if err != nil { return spoofs.Spoof{}, fmt.Errorf("failed to read from storage: %w", err) } - return svc.Spoof(), nil + for _, svc := range group { + if svc.IsActive { + return svc.Spoof(), nil + } + } + + return spoofs.Spoof{}, nil } -func (r *SpoofRepo) ReadFQDN(fqdn string) (spoofs.Spoof, error) { - allServices, err := r.store.LoadAll() +func (r *SpoofRepo) ReadMemberOf(memberOf string) (spoofs.Spoof, error) { + group, err := r.store.Load(memberOf) if err != nil { return spoofs.Spoof{}, fmt.Errorf("failed to read from storage: %w", err) } - for _, svc := range allServices { - if svc.Fqdn == fqdn { + for _, svc := range group { + if svc.IsActive { return svc.Spoof(), nil } } - return spoofs.Spoof{}, fmt.Errorf("%w: fqdn: %s", ErrSpoofWithFQDNNotFound, fqdn) + return spoofs.Spoof{}, fmt.Errorf("%w: fqdn: %s", ErrSpoofInServiceGroupNotFound, memberOf) } func (r *SpoofRepo) ReadAll() ([]spoofs.Spoof, error) { - allServices, err := r.store.LoadAll() + groups, err := r.store.LoadAll() if err != nil { return nil, fmt.Errorf("failed to read from storage: 
%w", err) } spoofs := make([]spoofs.Spoof, 0) - for _, svc := range allServices { - spoofs = append(spoofs, svc.Spoof()) - } - - return spoofs, nil -} - -func (r *SpoofRepo) HasOverride(fqdn string) (bool, error) { - spoof, err := r.ReadFQDN(fqdn) - if err != nil { - if errors.Is(err, ErrSpoofWithFQDNNotFound) { - return false, nil + for _, group := range groups { + for _, svc := range group { + if svc.IsActive { + spoofs = append(spoofs, svc.Spoof()) + } } - return false, err } - return spoof.DC == "OVERRIDE", nil + return spoofs, nil } diff --git a/internal/service/service.go b/internal/service/service.go index d57b4d4..bc3e578 100644 --- a/internal/service/service.go +++ b/internal/service/service.go @@ -18,7 +18,7 @@ type HealthChangeCallback func(healthy bool) type Service struct { id string - addr string + addr *net.TCPAddr Fqdn string MemberOf string Datacenter string @@ -51,7 +51,7 @@ func NewServiceFromGSLBConfig(config model.GSLBConfig, dryRun bool) (*Service, e interval := CalculateInterval(config.Priority, config.Interval) svc := &Service{ id: config.ServiceID, - addr: addr.String(), + addr: addr, Fqdn: config.Fqdn, MemberOf: config.MemberOf, Datacenter: config.Datacenter, @@ -60,7 +60,7 @@ func NewServiceFromGSLBConfig(config model.GSLBConfig, dryRun bool) (*Service, e defaultInterval: interval, priority: config.Priority, FailureThreshold: config.FailureThreshold, - failureCount: config.FailureThreshold, + failureCount: config.FailureThreshold, // need to succeed check N times before healthy! 
isHealthy: false, } @@ -75,13 +75,13 @@ func NewServiceFromGSLBConfig(config model.GSLBConfig, dryRun bool) (*Service, e svc.checker = checks.NewHTTPChecker("https://"+svc.Fqdn, checks.DEFAULT_TIMEOUT, config.Script) case config.CheckType == checks.TCP_FULL: - svc.checker = checks.NewTCPFullChecker(svc.addr, checks.DEFAULT_TIMEOUT) + svc.checker = checks.NewTCPFullChecker(svc.addr.String(), checks.DEFAULT_TIMEOUT) case config.CheckType == checks.TCP_HALF: - svc.checker = checks.NewTCPHalfChecker(svc.addr, checks.DEFAULT_TIMEOUT) + svc.checker = checks.NewTCPHalfChecker(svc.addr.String(), checks.DEFAULT_TIMEOUT) default: - svc.checker = checks.NewTCPFullChecker(svc.addr, checks.DEFAULT_TIMEOUT) + svc.checker = checks.NewTCPFullChecker(svc.addr.String(), checks.DEFAULT_TIMEOUT) } return svc, nil @@ -209,12 +209,8 @@ func (s *Service) GetPriority() int { return s.priority } -func (s *Service) GetIP() (string, error) { - ip, _, err := net.SplitHostPort(s.addr) - if err != nil { - return "", fmt.Errorf("could not read ip from network address: %s: %s", s.addr, err.Error()) - } - return ip, nil +func (s *Service) GetIP() string { + return s.addr.IP.String() } func (s *Service) GetDefaultInterval() timesutil.Duration { @@ -231,7 +227,7 @@ func (s *Service) GetFailureCount() int { func (s *Service) ConfigChanged(other *Service) bool { if s.Fqdn != other.Fqdn || - s.addr != other.addr || + s.addr.String() != other.addr.String() || s.Datacenter != other.Datacenter || s.FailureThreshold != other.FailureThreshold || s.priority != other.priority || @@ -244,9 +240,11 @@ func (s *Service) ConfigChanged(other *Service) bool { // updates the configuration values of s with the values of new func (s *Service) Assign(new *Service) { s.addr = new.addr + s.Fqdn = new.Fqdn s.checker = new.checker s.MemberOf = new.MemberOf s.priority = new.priority + s.checkType = new.checkType s.Datacenter = new.Datacenter s.defaultInterval = new.defaultInterval s.FailureThreshold = new.FailureThreshold 
@@ -256,18 +254,29 @@ func (s *Service) LogValue() slog.Value { if s == nil { return slog.StringValue("nil") } - ip, _ := s.GetIP() + return slog.GroupValue( slog.String("id", s.id), slog.String("memberOf", s.MemberOf), slog.String("fqdn", s.Fqdn), slog.String("datacenter", s.Datacenter), - slog.String("ip", ip), + slog.String("ip", s.GetIP()), ) } // satisfies the stringer interface to allow passing s for %v in formatted strings func (s *Service) String() string { - ip, _ := s.GetIP() - return fmt.Sprintf("id:%s, memberOf: %s, fqdn: %s, datacenter: %s, ip: %s", s.id, s.MemberOf, s.Fqdn, s.Datacenter, ip) + return fmt.Sprintf("id:%s, memberOf: %s, fqdn: %s, datacenter: %s, ip: %s", s.id, s.MemberOf, s.Fqdn, s.Datacenter, s.GetIP()) +} + +func (s *Service) GSLBService() *model.GSLBService { + return &model.GSLBService{ + ID: s.id, + MemberOf: s.MemberOf, + Fqdn: s.Fqdn, + Datacenter: s.Datacenter, + IP: s.GetIP(), + IsHealthy: s.isHealthy, + FailureCount: s.failureCount, + } } diff --git a/pkg/persistence/store/file/file.go b/pkg/persistence/store/file/file.go index c608aed..415abd3 100644 --- a/pkg/persistence/store/file/file.go +++ b/pkg/persistence/store/file/file.go @@ -11,7 +11,7 @@ import ( type Store[T any] struct { lock sync.RWMutex - cache *memory.Store[T] + cache *memory.Store[T] // dont check error because it is in memory fileName string } @@ -33,6 +33,8 @@ func (s *Store[T]) Save(key string, data T) error { s.lock.Lock() defer s.lock.Unlock() + s.cache.Save(key, data) + saved, err := os.ReadFile(s.fileName) if err != nil { return fmt.Errorf("could not read file: %s", err.Error()) @@ -62,51 +64,24 @@ func (s *Store[T]) Save(key string, data T) error { } func (s *Store[T]) Load(key string) (T, error) { - var zero T s.lock.Lock() defer s.lock.Unlock() - saved, err := os.ReadFile(s.fileName) - if err != nil { - return zero, fmt.Errorf("unable to read from storage: %s", err.Error()) - } - - store := make(map[string]T) - err = json.Unmarshal(saved, &store) - 
if err != nil { - return zero, fmt.Errorf("unable to read: %s: %s", key, err.Error()) - } - - return store[key], nil + return s.cache.Load(key) } func (s *Store[T]) LoadAll() ([]T, error) { s.lock.Lock() defer s.lock.Unlock() - all := []T{} - - saved, err := os.ReadFile(s.fileName) - if err != nil { - return nil, fmt.Errorf("unable to read from storage: %s", err.Error()) - } - - store := make(map[string]T) - err = json.Unmarshal(saved, &store) - if err != nil { - return nil, fmt.Errorf("unable to parse JSON: %s", err.Error()) - } - - for _, val := range store { - all = append(all, val) - } - - return all, nil + return s.cache.LoadAll() } func (s *Store[T]) Delete(key string) error { s.lock.Lock() defer s.lock.Unlock() + s.cache.Delete(key) + saved, err := os.ReadFile(s.fileName) if err != nil { return fmt.Errorf("could not read file: %s", err.Error()) diff --git a/pkg/persistence/store/memory/memory.go b/pkg/persistence/store/memory/memory.go index 21af799..2733295 100644 --- a/pkg/persistence/store/memory/memory.go +++ b/pkg/persistence/store/memory/memory.go @@ -1,7 +1,6 @@ package memory import ( - "fmt" "sync" ) @@ -30,7 +29,7 @@ func (s *Store[T]) Load(key string) (T, error) { val, exist := s.data[key] if !exist { var zero T - return zero, fmt.Errorf("resource: %s, does not exist", key) + return zero, nil } return val, nil } From e5bb19c84e3b0aca9c2d5506891258697d84e2c1 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Mon, 16 Feb 2026 13:26:08 +0100 Subject: [PATCH 04/24] feat: saving state on shutdown, and registering new services with config stored persistently --- TODOS.md | 8 +- internal/manager/manager.go | 60 +++++++++++++- internal/manager/manager_test.go | 2 +- internal/manager/scheduler/scheduler_test.go | 2 +- internal/manager/servicegroup_test.go | 4 +- internal/repositories/service/service.go | 82 ++++++++++++-------- internal/service/service.go | 31 +++++++- internal/service/service_test.go | 2 +- pkg/persistence/store/file/file.go | 48 
++++++++++-- 9 files changed, 189 insertions(+), 50 deletions(-) diff --git a/TODOS.md b/TODOS.md index 12b9a82..48a830e 100644 --- a/TODOS.md +++ b/TODOS.md @@ -7,11 +7,13 @@ - flags loader for config variables -- OnShutDown functions to save current state on shutdown - - expand to OnStart +- OnShutDown functions to save current state on shutdown ✅ + - expand to OnStart (unsure if this is necessary if handled correctly when registering services) - If svc not in DC, then roundtrip decides priority - AUTH -- Webhooks notifies on event? \ No newline at end of file +- Webhooks notifies on event? + +- worker pool stats handling from manager \ No newline at end of file diff --git a/internal/manager/manager.go b/internal/manager/manager.go index d395267..abd5e7f 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -71,6 +71,11 @@ func (sm *ServicesManager) Start() { func (sm *ServicesManager) Stop() { sm.pool.Stop() sm.stop.Do(func() { + err := sm.OnShutdown() + if err != nil { + bslog.Error("error while performing shutdown tasks", slog.String("error", err.Error())) + } + for interval, scheduler := range sm.schedulers { scheduler.Stop() bslog.Debug("scheduler closed", slog.String("interval", interval.String())) @@ -82,12 +87,38 @@ func (sm *ServicesManager) Stop() { } func (sm *ServicesManager) OnShutdown() error { + sm.mutex.Lock() + defer sm.mutex.Unlock() + + for memberOf, group := range sm.serviceGroups { + active := group.GetActive() + + for _, svc := range group.Members { + gslbService := svc.GSLBService() + + gslbService.IsActive = (active != nil && active.GetID() == svc.GetID()) + override, err := sm.svcRepo.HasOverride(memberOf) + if err != nil { + return fmt.Errorf("unable to check whether service group has active override: member-of: %s: %w", memberOf, err) + } + gslbService.HasOverride = override + + err = sm.svcRepo.Update(gslbService) + if err != nil { + return fmt.Errorf("failed to persist service state: service: %v: %w", svc, err) 
+ } + } + } return nil } func (sm *ServicesManager) RegisterService(serviceCfg model.GSLBConfig) (*service.Service, error) { - newService, err := service.NewServiceFromGSLBConfig(serviceCfg, sm.dryrun) // create the service object + opts := sm.BuildServiceOptions(serviceCfg) + newService, err := service.NewServiceFromGSLBConfig( // create the service object + serviceCfg, + opts..., + ) if err != nil { return nil, fmt.Errorf("unable to register service: %s", err.Error()) } @@ -475,3 +506,30 @@ func (sm *ServicesManager) Failover(fqdn string, failover failover.Failover) err return nil } + +func (sm *ServicesManager) BuildServiceOptions(config model.GSLBConfig) []service.ServiceOption { + opts := make([]service.ServiceOption, 0, 5) + opts = append(opts, service.WithDryRunChecks(sm.dryrun)) + + gslbService, err := sm.svcRepo.GetMemberInGroup(config.MemberOf, config.ServiceID) + if err != nil { + if errors.Is(err, svcRepo.ErrServiceInGroupNotFound) { + bslog.Debug("could not find member in group", + slog.String("group", config.MemberOf), + slog.String("member", config.ServiceID), + ) + } + // max out the failure count + // means a long time before service will be considered healthy + opts = append(opts, service.WithFailureCount(config.FailureThreshold)) + + return opts + } + + opts = append(opts, service.WithFailureCount(gslbService.FailureCount)) + if gslbService.IsHealthy { + opts = append(opts, service.WithHealthy()) + } + + return opts +} diff --git a/internal/manager/manager_test.go b/internal/manager/manager_test.go index 4e5befc..ad61df7 100644 --- a/internal/manager/manager_test.go +++ b/internal/manager/manager_test.go @@ -206,7 +206,7 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { t.Fatalf("could not create service during testing: %s", err.Error()) } - new, err := service.NewServiceFromGSLBConfig(tt.new, true) + new, err := service.NewServiceFromGSLBConfig(tt.new, service.WithDryRunChecks(true)) if err != nil { t.Fatalf("could not create 
service during testing: %s", err.Error()) } diff --git a/internal/manager/scheduler/scheduler_test.go b/internal/manager/scheduler/scheduler_test.go index b0927dc..2617689 100644 --- a/internal/manager/scheduler/scheduler_test.go +++ b/internal/manager/scheduler/scheduler_test.go @@ -93,7 +93,7 @@ func TestScheduler_Loop(t *testing.T) { for idx := range numServices { genericGSLBConfig.Fqdn = urls[idx] - svc, _ := service.NewServiceFromGSLBConfig(genericGSLBConfig, true) + svc, _ := service.NewServiceFromGSLBConfig(genericGSLBConfig, service.WithDryRunChecks(true)) services = append(services, svc) } diff --git a/internal/manager/servicegroup_test.go b/internal/manager/servicegroup_test.go index fe0d9e6..b746290 100644 --- a/internal/manager/servicegroup_test.go +++ b/internal/manager/servicegroup_test.go @@ -39,8 +39,8 @@ var active *service.Service var passive *service.Service func TestMain(m *testing.M) { - active, _ = service.NewServiceFromGSLBConfig(activeConfig, true) - passive, _ = service.NewServiceFromGSLBConfig(passiveConfig, true) + active, _ = service.NewServiceFromGSLBConfig(activeConfig, service.WithDryRunChecks(true)) + passive, _ = service.NewServiceFromGSLBConfig(passiveConfig, service.WithDryRunChecks(true)) m.Run() } diff --git a/internal/repositories/service/service.go b/internal/repositories/service/service.go index 68f15b6..036c02c 100644 --- a/internal/repositories/service/service.go +++ b/internal/repositories/service/service.go @@ -11,6 +11,7 @@ import ( var ( ErrServiceWithMemberOfNotFound = errors.New("service with member-of not found") + ErrServiceInGroupNotFound = errors.New("service in service-group not found") ) // repository for services that are considered active in a service group @@ -54,7 +55,8 @@ func (sr *ServiceRepo) Create(new *model.GSLBService) error { func(s model.GSLBService) bool { return s.ID == new.ID }) { - return fmt.Errorf("failed to store service: service already exists") + //update instead + return sr.Update(new) } 
group = append(group, *new) @@ -77,26 +79,27 @@ func (sr *ServiceRepo) Update(new *model.GSLBService) error { return fmt.Errorf("failed to check for existing service group: %w", err) } - if group == nil { - return fmt.Errorf("failed to update service: service group for: %s does not exist", new.MemberOf) - } - - for idx, svc := range group { - if svc.ID == new.ID { - if svc.IsActive { - if override { - new.IP = svc.IP // preserve ip on override, but only for active - new.HasOverride = true - } - } - group[idx] = *new - err = sr.store.Save(new.MemberOf, group) - - if err != nil { - return fmt.Errorf("failed to update entry with id: %s: %w", new.MemberOf, err) - } - break - } + if len(group) == 0 { + return fmt.Errorf("failed to update service group: %s does not exist", new.MemberOf) + } + + idx := slices.IndexFunc(group, func(s model.GSLBService) bool { + return s.ID == new.ID + }) + + if idx == -1 { + return fmt.Errorf("%w: %s id: %s", ErrServiceInGroupNotFound, new.MemberOf, new.ID) + } + + if group[idx].IsActive && override { + new.IP = group[idx].IP + new.HasOverride = true + } + + group[idx] = *new + + if err := sr.store.Save(new.MemberOf, group); err != nil { + return fmt.Errorf("failed to update entry with id: %s: %w", new.MemberOf, err) } return nil @@ -110,21 +113,22 @@ func (sr *ServiceRepo) UpdateOverride(ip string, service *model.GSLBService) err return fmt.Errorf("failed to retrieve service group: %w", err) } - if group == nil { + if len(group) == 0 { return fmt.Errorf("failed to update service: service group for: %s does not exist", service.MemberOf) } - for idx, svc := range group { - if svc.ID == service.ID { - group[idx] = *service - err = sr.store.Save(service.MemberOf, group) + idx := slices.IndexFunc(group, func(s model.GSLBService) bool { + return s.ID == service.ID + }) - if err != nil { - return fmt.Errorf("failed to update override: %w", err) - } - break - } + if idx == -1 { + return fmt.Errorf("%w: %s id: %s", ErrServiceInGroupNotFound, 
service.MemberOf, service.ID) + } + group[idx] = *service + if err := sr.store.Save(service.MemberOf, group); err != nil { + return fmt.Errorf("failed to update override: %w", err) } + return nil } @@ -205,6 +209,22 @@ func (sr *ServiceRepo) GetActive(memberOf string) (model.GSLBService, error) { return model.GSLBService{}, fmt.Errorf("%w: member-of %s", ErrServiceWithMemberOfNotFound, memberOf) } +func (sr *ServiceRepo) GetMemberInGroup(memberOf, memberId string) (model.GSLBService, error) { + group, err := sr.Read(memberOf) + if err != nil { + return model.GSLBService{}, err + } + + idx := slices.IndexFunc(group, func(s model.GSLBService) bool { + return s.ID == memberId + }) + if idx == -1 { + return model.GSLBService{}, fmt.Errorf("%w: member-of: %s: member-id: %s", ErrServiceInGroupNotFound, memberOf, memberId) + } + + return group[idx], nil +} + func (sr *ServiceRepo) HasOverride(memberOf string) (bool, error) { svc, err := sr.GetActive(memberOf) if err != nil { diff --git a/internal/service/service.go b/internal/service/service.go index bc3e578..223e5d3 100644 --- a/internal/service/service.go +++ b/internal/service/service.go @@ -15,6 +15,7 @@ import ( const DEFAULT_FAILURE_THRESHOLD = 3 type HealthChangeCallback func(healthy bool) +type ServiceOption func(s *Service) type Service struct { id string @@ -31,9 +32,10 @@ type Service struct { checker checks.Checker healthChangeCallback HealthChangeCallback isHealthy bool + dryRun bool } -func NewServiceFromGSLBConfig(config model.GSLBConfig, dryRun bool) (*Service, error) { +func NewServiceFromGSLBConfig(config model.GSLBConfig, opts ...ServiceOption) (*Service, error) { ip := net.ParseIP(config.Ip) if ip == nil { return nil, ErrUnableToParseIpAddr @@ -62,10 +64,15 @@ func NewServiceFromGSLBConfig(config model.GSLBConfig, dryRun bool) (*Service, e FailureThreshold: config.FailureThreshold, failureCount: config.FailureThreshold, // need to succeed check N times before healthy! 
isHealthy: false, + dryRun: false, + } + + for _, opt := range opts { + opt(svc) } switch { - case dryRun: + case svc.dryRun: svc.checker = &checks.DryRun{} case config.CheckType == checks.HTTPS: @@ -87,6 +94,26 @@ func NewServiceFromGSLBConfig(config model.GSLBConfig, dryRun bool) (*Service, e return svc, nil } +func WithDryRunChecks(enabled bool) ServiceOption { + return func(s *Service) { + s.dryRun = enabled + } +} + +func WithHealthy() ServiceOption { + return func(s *Service) { + s.isHealthy = true + } +} + +func WithFailureCount(count int) ServiceOption { + return func(s *Service) { + if count > -1 { + s.failureCount = count + } // default values are handled in the creation of the service! + } +} + // 5s, 15s, 45s, checks.MAX_CHECK_INTERVAL. // Exponential growth of duration based on priority. Up to checks.MAX_CHECK_INTERVAL func CalculateInterval(priority int, baseInterval timesutil.Duration) timesutil.Duration { diff --git a/internal/service/service_test.go b/internal/service/service_test.go index 23481f5..2baef55 100644 --- a/internal/service/service_test.go +++ b/internal/service/service_test.go @@ -328,7 +328,7 @@ func TestService_GetBaseInterval(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - s, err := NewServiceFromGSLBConfig(tt.config, tt.dryRun) + s, err := NewServiceFromGSLBConfig(tt.config, WithDryRunChecks(tt.dryRun)) if err != nil { t.Fatalf("could not construct receiver type: %v", err) } diff --git a/pkg/persistence/store/file/file.go b/pkg/persistence/store/file/file.go index 415abd3..1d7406b 100644 --- a/pkg/persistence/store/file/file.go +++ b/pkg/persistence/store/file/file.go @@ -5,13 +5,11 @@ import ( "fmt" "os" "sync" - - "github.com/vitistack/gslb-operator/pkg/persistence/store/memory" ) type Store[T any] struct { lock sync.RWMutex - cache *memory.Store[T] // dont check error because it is in memory + cache map[string]T fileName string } @@ -25,7 +23,7 @@ func NewStore[T any](fileName string) 
(*Store[T], error) { return &Store[T]{ lock: sync.RWMutex{}, fileName: fileName, - cache: memory.NewStore[T](), + cache: make(map[string]T), }, nil } @@ -33,7 +31,7 @@ func (s *Store[T]) Save(key string, data T) error { s.lock.Lock() defer s.lock.Unlock() - s.cache.Save(key, data) + s.cache[key] = data saved, err := os.ReadFile(s.fileName) if err != nil { @@ -66,21 +64,55 @@ func (s *Store[T]) Save(key string, data T) error { func (s *Store[T]) Load(key string) (T, error) { s.lock.Lock() defer s.lock.Unlock() + var zero T + + data, ok := s.cache[key] + if ok { + return data, nil + } + + file, err := os.ReadFile(s.fileName) + if err != nil { + return zero, fmt.Errorf("unable to read storage: %w", err) + } - return s.cache.Load(key) + err = json.Unmarshal(file, &s.cache) + if err != nil { + return zero, fmt.Errorf("unable to parse: %s: %s", key, err.Error()) + } + + return s.cache[key], nil } func (s *Store[T]) LoadAll() ([]T, error) { s.lock.Lock() defer s.lock.Unlock() - return s.cache.LoadAll() + all := []T{} + + saved, err := os.ReadFile(s.fileName) + if err != nil { + return nil, fmt.Errorf("unable to read from storage: %s", err.Error()) + } + + store := make(map[string]T) + err = json.Unmarshal(saved, &store) + if err != nil { + return nil, fmt.Errorf("unable to parse JSON: %s", err.Error()) + } + + for key, val := range store { + s.cache[key] = val + all = append(all, val) + } + + return all, nil } func (s *Store[T]) Delete(key string) error { s.lock.Lock() defer s.lock.Unlock() - s.cache.Delete(key) + delete(s.cache, key) saved, err := os.ReadFile(s.fileName) if err != nil { From 27f39f1f6685037d50f3065520bb68f7419ccf54 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 17 Feb 2026 09:07:39 +0100 Subject: [PATCH 05/24] feat: roundtrip time for service prioritization in a service group --- TODOS.md | 2 -- internal/checks/checker.go | 3 ++ internal/checks/dryrun.go | 5 +++ internal/checks/http.go | 10 +++++- internal/checks/roundtrip.go | 60 
++++++++++++++++++++++++++++++++ internal/checks/tcp.go | 21 ++++++++--- internal/manager/servicegroup.go | 56 ++++++++++++++++------------- internal/service/service.go | 4 +++ 8 files changed, 128 insertions(+), 33 deletions(-) create mode 100644 internal/checks/roundtrip.go diff --git a/TODOS.md b/TODOS.md index 48a830e..6d54a1c 100644 --- a/TODOS.md +++ b/TODOS.md @@ -10,8 +10,6 @@ - OnShutDown functions to save current state on shutdown ✅ - expand to OnStart (unsure if this is necessary if handled correctly when registering services) -- If svc not in DC, then roundtrip decides priority - - AUTH - Webhooks notifies on event? diff --git a/internal/checks/checker.go b/internal/checks/checker.go index dcd0925..28ec341 100644 --- a/internal/checks/checker.go +++ b/internal/checks/checker.go @@ -1,5 +1,8 @@ package checks +import "time" + type Checker interface { Check() error + Roundtrip() time.Duration } diff --git a/internal/checks/dryrun.go b/internal/checks/dryrun.go index 177f920..f02a769 100644 --- a/internal/checks/dryrun.go +++ b/internal/checks/dryrun.go @@ -3,6 +3,7 @@ package checks import ( "errors" "math/rand" + "time" ) type DryRun struct{} @@ -14,3 +15,7 @@ func (dr *DryRun) Check() error { } return nil } + +func (dr *DryRun) Roundtrip() time.Duration { + return time.Duration(0) +} diff --git a/internal/checks/http.go b/internal/checks/http.go index 23b189b..783ddb1 100644 --- a/internal/checks/http.go +++ b/internal/checks/http.go @@ -8,6 +8,7 @@ import ( ) type HTTPChecker struct { + *RoundTripper url string client *http.Client validator *LuaValidator @@ -25,7 +26,8 @@ func NewHTTPChecker(url string, timeout time.Duration, validationScripts ...stri } return &HTTPChecker{ - url: url, + RoundTripper: NewRoundtripper(), + url: url, client: &http.Client{ Timeout: timeout, Transport: transport, @@ -35,7 +37,9 @@ func NewHTTPChecker(url string, timeout time.Duration, validationScripts ...stri } func (c *HTTPChecker) Check() error { + c.startRecording() 
resp, err := c.client.Get(c.url) + c.endRecording() if err != nil { return err } @@ -51,3 +55,7 @@ func (c *HTTPChecker) Check() error { resp.Body.Close() return nil } + +func (c *HTTPChecker) Roundtrip() time.Duration { + return c.AverageRoundtripTime() +} diff --git a/internal/checks/roundtrip.go b/internal/checks/roundtrip.go new file mode 100644 index 0000000..8efe431 --- /dev/null +++ b/internal/checks/roundtrip.go @@ -0,0 +1,60 @@ +package checks + +import ( + "sync" + "time" +) + +type RoundTripper struct { + mu sync.RWMutex + currentTripStart time.Time + roundtrips []time.Duration + roundtripIdx int // current index to populate + count int + roundtripCapacity int +} + +func NewRoundtripper() *RoundTripper { + return &RoundTripper{ + mu: sync.RWMutex{}, + roundtrips: make([]time.Duration, 20), + roundtripIdx: 0, + count: 0, + roundtripCapacity: 20, + } +} + +func (rt *RoundTripper) startRecording() { + rt.mu.Lock() + defer rt.mu.Unlock() + rt.currentTripStart = time.Now() +} + +func (rt *RoundTripper) endRecording() { + rt.mu.Lock() + defer rt.mu.Unlock() + + rt.roundtrips[rt.roundtripIdx] = time.Since(rt.currentTripStart) + rt.roundtripIdx = (rt.roundtripIdx + 1) % rt.roundtripCapacity + + if rt.count < rt.roundtripCapacity { + rt.count++ + } +} + +func (rt *RoundTripper) AverageRoundtripTime() time.Duration { + rt.mu.RLock() + defer rt.mu.RUnlock() + + if rt.count == 0 { + return time.Duration(0) + } + + var sum time.Duration + + for _, trip := range rt.roundtrips { + sum += trip + } + + return sum / time.Duration(rt.count) +} diff --git a/internal/checks/tcp.go b/internal/checks/tcp.go index 63deb89..959e548 100644 --- a/internal/checks/tcp.go +++ b/internal/checks/tcp.go @@ -9,10 +9,15 @@ import ( ) type TCPChecker struct { + *RoundTripper addr string timeout time.Duration } +func (c *TCPChecker) Roundtrip() time.Duration { + return c.AverageRoundtripTime() +} + type TCPFullChecker struct { TCPChecker } @@ -30,15 +35,18 @@ func NewTCPChecker(typ, addr 
string, timeout time.Duration) Checker { func NewTCPFullChecker(addr string, timeout time.Duration) Checker { return &TCPFullChecker{ - TCPChecker: TCPChecker{ - addr: addr, - timeout: timeout, + TCPChecker{ + RoundTripper: NewRoundtripper(), + addr: addr, + timeout: timeout, }, } } func (tf *TCPFullChecker) Check() error { + tf.startRecording() conn, err := net.DialTimeout("tcp", tf.addr, tf.timeout) + tf.endRecording() if err != nil { return err } @@ -53,8 +61,9 @@ type TCPHalfChecker struct { func NewTCPHalfChecker(addr string, timeout time.Duration) Checker { return &TCPHalfChecker{ TCPChecker{ - addr: addr, - timeout: timeout, + RoundTripper: NewRoundtripper(), + addr: addr, + timeout: timeout, }, } } @@ -62,7 +71,9 @@ func NewTCPHalfChecker(addr string, timeout time.Duration) Checker { func (th *TCPHalfChecker) Check() error { checker := tcpshaker.DefaultChecker() + th.startRecording() err := checker.CheckAddr(th.addr, th.timeout) + th.endRecording() if err != nil { if errors.Is(err, tcpshaker.ErrTimeout) { return err diff --git a/internal/manager/servicegroup.go b/internal/manager/servicegroup.go index 392ea4d..14f893e 100644 --- a/internal/manager/servicegroup.go +++ b/internal/manager/servicegroup.go @@ -6,7 +6,6 @@ import ( "slices" "sync" - "github.com/vitistack/gslb-operator/internal/config" "github.com/vitistack/gslb-operator/internal/service" "github.com/vitistack/gslb-operator/internal/utils" "github.com/vitistack/gslb-operator/pkg/bslog" @@ -64,14 +63,12 @@ type ServiceGroup struct { } func NewEmptyServiceGroup() *ServiceGroup { - datacenter := config.GetInstance().Server().Datacenter() return &ServiceGroup{ - mode: ActiveActive, - Members: make([]*service.Service, 0), - active: nil, - lastActive: nil, - prioritizedDatacenter: datacenter, - mu: sync.RWMutex{}, + mode: ActiveActive, + Members: make([]*service.Service, 0), + active: nil, + lastActive: nil, + mu: sync.RWMutex{}, } } @@ -384,23 +381,7 @@ func (sg *ServiceGroup) Update() { 
sg.mu.RUnlock() sg.mu.Lock() - slices.SortFunc(sg.Members, func(a, b *service.Service) int { - aPriority := a.GetPriority() - bPriority := b.GetPriority() - - if aPriority != bPriority { - return cmp.Compare(aPriority, bPriority) - } - - // equal priority - prioritized datacenter decides (ActiveActive tie-break) - if a.Datacenter == sg.prioritizedDatacenter { - return -1 - } else if b.Datacenter == sg.prioritizedDatacenter { - return 1 - } - - return 0 - }) + slices.SortFunc(sg.Members, sortMembersFunc) sg.mu.Unlock() sg.SetGroupMode() @@ -420,3 +401,28 @@ func (sg *ServiceGroup) Update() { sg.OnPromotion(event) } } + +// func passed into slices.SortFunc for sorting the groups members +func sortMembersFunc(a, b *service.Service) int { + aPriority := a.GetPriority() + bPriority := b.GetPriority() + + if aPriority != bPriority { + return cmp.Compare(aPriority, bPriority) + } + + aRoundtrip := a.GetAverageRoundtrip() + bRoundtrip := b.GetAverageRoundtrip() + + // handle case where no roundtrip time has been recorded + aHasRoundtrip := aRoundtrip > 0 + bHasRoundtrip := bRoundtrip > 0 + + if aHasRoundtrip && bHasRoundtrip { + return cmp.Compare(aRoundtrip, bRoundtrip) + } else if aHasRoundtrip && !bHasRoundtrip { // prioritize the one who has recorded data + return -1 + } else { + return 1 + } +} diff --git a/internal/service/service.go b/internal/service/service.go index 223e5d3..6288316 100644 --- a/internal/service/service.go +++ b/internal/service/service.go @@ -252,6 +252,10 @@ func (s *Service) GetFailureCount() int { return s.failureCount } +func (s *Service) GetAverageRoundtrip() time.Duration { + return s.checker.Roundtrip() +} + func (s *Service) ConfigChanged(other *Service) bool { if s.Fqdn != other.Fqdn || s.addr.String() != other.addr.String() || From 041a826e4bb4a5f3ed5d085e60d619fba16f9160 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Thu, 19 Feb 2026 11:37:28 +0100 Subject: [PATCH 06/24] pipelines, and fixed tests --- 
.github/workflows/build_test.yml | 33 +++++ .github/workflows/release.yml | 55 ++++++++ .github/workflows/security_scan.yml | 2 +- .gitignore | 2 + Dockerfile | 29 ++++ Makefile | 81 +++++++++++ cmd/main.go | 9 ++ go.mod | 2 +- internal/manager/manager_test.go | 31 +++-- internal/manager/scheduler/scheduler.go | 9 +- internal/manager/scheduler/schedulerHeap.go | 13 +- internal/manager/scheduler/scheduler_test.go | 134 ++++++++++++------- internal/manager/servicegroup.go | 7 +- internal/manager/servicegroup_test.go | 2 + internal/service/service_test.go | 13 +- pkg/dnsdist/client.go | 26 ++-- pkg/dnsdist/client_test.go | 6 +- pkg/persistence/store/file/file.go | 8 ++ 18 files changed, 365 insertions(+), 97 deletions(-) create mode 100644 .github/workflows/build_test.yml create mode 100644 .github/workflows/release.yml create mode 100644 Dockerfile create mode 100644 Makefile diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml new file mode 100644 index 0000000..135b884 --- /dev/null +++ b/.github/workflows/build_test.yml @@ -0,0 +1,33 @@ +name: Build and Test + +on: + push: + branches: ["main", "develop"] + pull_request: + branches: ["main", "develop"] + +jobs: + build-and-test: + runs-on: ubuntu-latest + steps: + - name: checkout code + uses: actions/checkout@v4 + + - name: setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go-mod + cache: true + cache-dependency-path: go.sum + + - name: install dependencies + run: go mod download + + - name: build + run: | + #test version not in use for release + export VERSION=test-$(git rev-parse --short HEAD) + make build + + - name: test + run: go test -v ./... 
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..7b0ef70 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,55 @@ +name: Release + +on: + push: + tags: + - "v*" + +jobs: + create-oci-image: + runs-on: ubuntu-latest + steps: + - name: checkout code + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/vitistack/gslb-operator + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=sha + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set release date + id: date + run: echo "DATE=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + build-args: | + VERSION=${{ github.ref_name }} + DATE=${{ steps.date.outputs.DATE }} + platforms: linux/amd64,linux/arm64 + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.github/workflows/security_scan.yml b/.github/workflows/security_scan.yml index b2c30db..fa9dd2e 100644 --- a/.github/workflows/security_scan.yml +++ b/.github/workflows/security_scan.yml @@ -4,7 +4,7 @@ # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. 
-name: "Security Scan" +name: Security Scan on: push: diff --git a/.gitignore b/.gitignore index 39f3e89..b7792ea 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ *.so *.dylib +/bin + # Test binary, built with `go test -c` *.test diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..42e158d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +FROM golang:1.26 AS build + +LABEL MAINTAINER="espen.wobbes@nhn.no" + +ARG VERSION +ARG DATE + +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . +# build image +RUN CGO_ENABLED=0 go build -ldflags "-s -w -X main.version=${VERSION} -X main.buildDate=${DATE}" -o gslb-operator ./cmd/main.go + + +FROM alpine:3.23 + +WORKDIR /app + +RUN addgroup -S gslb-group && adduser -S gslb-operator -G gslb-group +RUN chown -R gslb-operator:gslb-group /app + +COPY --from=build /app/gslb-operator /app/gslb-operator +COPY sandbox.lua /app + +USER gslb-operator + +CMD [ "./gslb-operator" ] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..701ba00 --- /dev/null +++ b/Makefile @@ -0,0 +1,81 @@ +.PHONY: help +help: ## Display this help. + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +# Variables +GO_VERSION := $(shell go version | cut -d' ' -f3) +DOCKER_COMPOSE := docker compose +HELM := helm +KUBECTL := kubectl +DATE := $(shell date +%Y-%m-%d) +VERSION ?= "test" + + +##@ Build +.PHONY: build run +build: check-tools ## Build the Go application. + @echo "Building GSLB - Operator binary..." 
+ @echo "Version: $(VERSION)" + @echo "Date: $(DATE)" + @go build -ldflags "-s -w -X main.version=$(VERSION) -X main.buildDate=$(DATE)" -o ./bin/ ./cmd/main.go + +run: + @echo "Running GSLB - Operator" + @go run -ldflags "-X main.version=0.0.0-test -X main.buildDate=$(DATE)" ./cmd/main.go + +test: ## Run tests + @echo "Running tests..." + @go test -v ./... + @echo "Tests complete!" + +deps: ## Download and verify dependencies + @echo "Downloading dependencies..." + @go mod download + @go mod verify + @go mod tidy + @echo "Dependencies updated!" + +update-deps: ## Update dependencies + @echo "Updating dependencies..." + @go get -u ./... + @go mod tidy + @echo "Dependencies updated!" + +##@ Code Quality +.PHONY: lint format security-scan bench +lint: ## Run Go linters + @echo "Running Go linters..." + @command -v golangci-lint >/dev/null 2>&1 || { echo "Installing golangci-lint..."; go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest; } + @golangci-lint run ./... + @echo "Linting complete!" + +format: ## Format Go code + @echo "Formatting Go code..." + @go fmt ./... + @echo "Code formatted!" + +go-security-scan: ## Run security scan + @echo "Running security scan..." + @command -v govulncheck >/dev/null 2>&1 || { echo "Installing govulncheck..."; go install golang.org/x/vuln/cmd/govulncheck@latest; } + @gosec ./... + @echo "Security scan complete!" + +bench: ## Run benchmarks + @echo "Running benchmarks..." + @go test -bench=. -benchmem ./... + @echo "Benchmarks complete!" + + +##@ Tools +.PHONY: check-tools install-tools +# Check if required tools are installed +check-tools: + @command -v go >/dev/null 2>&1 || { echo "Go is required but not installed. Aborting." >&2; exit 1; } + @command -v docker >/dev/null 2>&1 || { echo "Docker is required but not installed. Aborting." >&2; exit 1; } + @command -v $(DOCKER_COMPOSE) >/dev/null 2>&1 || { echo "Docker Compose is required but not installed. Aborting." 
>&2; exit 1; } + +install-tools: ## Install development tools + @echo "Installing development tools..." + @go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest + @go install golang.org/x/vuln/cmd/govulncheck@latest; + @echo "Development tools installed!" \ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index 930149a..a387544 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -26,7 +26,16 @@ import ( "github.com/vitistack/gslb-operator/pkg/rest/middleware" ) +var ( // injected at buildtime + version string + buildDate string +) + func main() { + bslog.Info("Running GSLB - Operator", + slog.String("version", version), + slog.String("build-date", buildDate), + ) cfg := config.GetInstance() // initialize lua execution environment diff --git a/go.mod b/go.mod index 65df623..560d08a 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/vitistack/gslb-operator -go 1.25.7 +go 1.26 require ( codeberg.org/miekg/dns v0.6.48 diff --git a/internal/manager/manager_test.go b/internal/manager/manager_test.go index ad61df7..18321af 100644 --- a/internal/manager/manager_test.go +++ b/internal/manager/manager_test.go @@ -16,7 +16,7 @@ var genericGSLBConfig = model.GSLBConfig{ Ip: "192.168.1.1", Port: "80", Datacenter: "dc1", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(30 * time.Second), Priority: 1, CheckType: "TCP-FULL", } @@ -96,7 +96,7 @@ func TestStartAndStop(t *testing.T) { } } -func TestServicesManager_updateServiceUnlocked(t *testing.T) { +func TestServicesManager_updateService(t *testing.T) { tests := []struct { name string // description of this test case old model.GSLBConfig @@ -112,7 +112,7 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { Ip: "192.168.1.1", Port: "80", Datacenter: "dc1", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(30 * time.Second), Priority: 2, CheckType: "TCP-FULL", }, @@ -127,7 +127,7 @@ func 
TestServicesManager_updateServiceUnlocked(t *testing.T) { Ip: "192.168.1.1", Port: "80", Datacenter: "dc2", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(30 * time.Second), Priority: 1, CheckType: "TCP-FULL", }, @@ -142,7 +142,7 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { Ip: "192.168.1.2", Port: "80", Datacenter: "dc1", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(30 * time.Second), Priority: 1, CheckType: "TCP-FULL", }, @@ -157,7 +157,7 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { Ip: "192.168.1.1", Port: "80", Datacenter: "dc1", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(30 * time.Second), Priority: 1, CheckType: "TCP-HALF", }, @@ -172,7 +172,7 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { Ip: "192.168.1.1", Port: "80", Datacenter: "dc1", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(30 * time.Second), Priority: 1, CheckType: "TCP-FULL", }, @@ -182,12 +182,12 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { old: genericGSLBConfig, new: model.GSLBConfig{ ServiceID: "123-test-456", - MemberOf: "example.example.com", + MemberOf: "example.com", Fqdn: "testing.example.com", Ip: "192.168.1.1", Port: "80", Datacenter: "dc1", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(30 * time.Second), Priority: 1, CheckType: "TCP-FULL", }, @@ -197,6 +197,7 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { t.Run(tt.name, func(t *testing.T) { sm := NewManager(WithDryRun(true)) sm.Start() + defer sm.Stop() sm.DNSUpdate = func(s *service.Service, b bool) { @@ -213,12 +214,16 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { sm.updateService(old, new) if old.ConfigChanged(new) { - t.Error("still pending config changes after update") + t.Fatal("still pending config changes after update") } _, interval, svc 
:= sm.scheduledServices.Search(old.GetID()) if interval != new.GetDefaultInterval() { - t.Errorf("the service was not located at its correct interval, expected: %s but got: %s", new.GetDefaultInterval(), interval) + t.Fatalf("the service was not located at its correct interval, expected: %s but got: %s", new.GetDefaultInterval(), interval) + } + + if ok := sm.serviceGroups[old.MemberOf].memberExists(old); !ok { + t.Fatalf("service does not exist in expected service group, expected: %s", old.MemberOf) } if svc != old { @@ -228,6 +233,7 @@ func TestServicesManager_updateServiceUnlocked(t *testing.T) { } } + func TestServicesManager_moveServiceToInterval(t *testing.T) { tests := []struct { name string // description of this test case @@ -251,6 +257,8 @@ func TestServicesManager_moveServiceToInterval(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { sm := NewManager(WithDryRun(true)) + sm.Start() + defer sm.Stop() svc, _ := sm.RegisterService(tt.config) if tt.shouldExist { @@ -270,3 +278,4 @@ func TestServicesManager_moveServiceToInterval(t *testing.T) { }) } } + diff --git a/internal/manager/scheduler/scheduler.go b/internal/manager/scheduler/scheduler.go index 731f8dd..8119454 100644 --- a/internal/manager/scheduler/scheduler.go +++ b/internal/manager/scheduler/scheduler.go @@ -93,12 +93,15 @@ func (s *Scheduler) ScheduleService(svc *service.Service) { func (s *Scheduler) RemoveService(svc *service.Service) bool { s.mu.Lock() defer s.mu.Unlock() - idx := s.heap.GetServiceIndex(svc) + + idx := s.heap.GetServiceIndex(svc.GetID()) if idx == -1 { return s.heap.Len() == 0 - } - if idx == 0 { + } else if idx == 0 { s.heap[0].shouldReSchedule = false + if len(s.heap) == 1 { + return true + } } else { heap.Remove(&s.heap, idx) } diff --git a/internal/manager/scheduler/schedulerHeap.go b/internal/manager/scheduler/schedulerHeap.go index fc66f6f..4907dfc 100644 --- a/internal/manager/scheduler/schedulerHeap.go +++ 
b/internal/manager/scheduler/schedulerHeap.go @@ -1,7 +1,7 @@ package scheduler import ( - "github.com/vitistack/gslb-operator/internal/service" + "slices" ) type ServiceHeap []*ScheduledService @@ -38,11 +38,8 @@ func (h ServiceHeap) Peek() *ScheduledService { return h[0] } -func (h *ServiceHeap) GetServiceIndex(service *service.Service) int { - for index, scheduled := range *h { - if scheduled.service.GetID() == service.GetID() { - return index - } - } - return -1 +func (h *ServiceHeap) GetServiceIndex(id string) int { + return slices.IndexFunc(*h, func(s *ScheduledService) bool { + return s.service.GetID() == id + }) } diff --git a/internal/manager/scheduler/scheduler_test.go b/internal/manager/scheduler/scheduler_test.go index 2617689..1e29184 100644 --- a/internal/manager/scheduler/scheduler_test.go +++ b/internal/manager/scheduler/scheduler_test.go @@ -1,8 +1,6 @@ package scheduler import ( - "fmt" - "math/rand" "sync" "testing" "time" @@ -13,10 +11,21 @@ import ( ) var genericGSLBConfig = model.GSLBConfig{ + ServiceID: "123", Ip: "192.168.1.1", Port: "80", Datacenter: "dc1", - Interval: timesutil.Duration(5 * time.Second), + Interval: timesutil.Duration(time.Second), + Priority: 1, + CheckType: "TCP-FULL", +} + +var genericGSLBConfig2 = model.GSLBConfig{ + ServiceID: "456", + Ip: "192.168.1.2", + Port: "80", + Datacenter: "dc2", + Interval: timesutil.Duration(time.Second), Priority: 1, CheckType: "TCP-FULL", } @@ -62,63 +71,96 @@ func TestNewScheduler(t *testing.T) { } } -func TestScheduler_Loop(t *testing.T) { +func TestScheduleService(t *testing.T) { + svc, err := service.NewServiceFromGSLBConfig(genericGSLBConfig) + if err != nil { + t.Fatalf("could not create test service: %s", err.Error()) + } + + receivedTick := false + + wg := sync.WaitGroup{} + scheduler := NewScheduler(time.Duration(svc.GetDefaultInterval()), &wg) + scheduler.OnTick = func(s *service.Service) { + receivedTick = true + } + defer scheduler.Stop() + + scheduler.ScheduleService(svc) + 
if !scheduler.isRunning { + t.Errorf("scheduler is not running, expected: isRunning == true, but got: isRunning == false") + } + + if len(scheduler.heap) == 0 && !receivedTick { + t.Errorf("scheduler is running, but heap size is 0, means scheduler has pop'ed the heap before received tick") + } + +} + +func TestScheduler_RemoveService(t *testing.T) { + svc1, _ := service.NewServiceFromGSLBConfig(genericGSLBConfig) + svc2, _ := service.NewServiceFromGSLBConfig(genericGSLBConfig2) tests := []struct { name string // description of this test case // Named input parameters for receiver constructor. interval time.Duration + wg *sync.WaitGroup + // Named input parameters for target function. + svc *service.Service + want bool + addSecond bool + removeSecond bool }{ { - name: "100-services-on-5s", - interval: time.Second * 5, + name: "only-one", + interval: time.Second, + wg: &sync.WaitGroup{}, + svc: svc1, + want: true, + addSecond: false, + removeSecond: false, }, { - name: "100-services-on-15s", - interval: time.Second * 15, + name: "add-second-remove-first", + interval: time.Second, + wg: &sync.WaitGroup{}, + svc: svc1, + want: false, + addSecond: true, + removeSecond: false, }, { - name: "100-services-on-45s", - interval: time.Second * 45, + name: "add-second-remove-second", + interval: time.Second, + wg: &sync.WaitGroup{}, + svc: svc1, + want: false, + addSecond: true, + removeSecond: true, }, - { - name: "100-services-on-60s", - interval: time.Second * 60, - }, - } - - numServices := 100 - urls := randomUrlIDs(numServices) - - services := make([]*service.Service, 0, 100) - - for idx := range numServices { - genericGSLBConfig.Fqdn = urls[idx] - svc, _ := service.NewServiceFromGSLBConfig(genericGSLBConfig, service.WithDryRunChecks(true)) - services = append(services, svc) } - for _, tt := range tests { - scheduler := NewScheduler(tt.interval, &sync.WaitGroup{}) - scheduler.OnTick = func(s *service.Service) { - t.Logf("received tick for: %s\n", s.Fqdn) - } - - for _, 
svc := range services { - scheduler.ScheduleService(svc) - } - time.Sleep(time.Second * 6) - } -} + t.Run(tt.name, func(t *testing.T) { + s := NewScheduler(tt.interval, tt.wg) -func randomUrlIDs(num int) []string { - baseUrl := "test.example.com" - urls := make([]string, 0, num) + s.ScheduleService(tt.svc) + var got bool + if tt.addSecond { + s.ScheduleService(svc2) + } - const charSet = "abcdefghijklmnopqrstuvwxyz" - for range num { - idx := rand.Intn(len(charSet)) - urls = append(urls, fmt.Sprintf("%v/%v", baseUrl, charSet[idx])) - } + if tt.removeSecond { + got = s.RemoveService(svc2) + } else { + got = s.RemoveService(tt.svc) + if s.heap.Peek().shouldReSchedule { + t.Errorf("scheduled service are set to be rescheduled after remove has been called") + } + } - return urls + if got != tt.want { + t.Errorf("RemoveService() = %v, but wanted %v", got, tt.want) + } + }) + } } diff --git a/internal/manager/servicegroup.go b/internal/manager/servicegroup.go index 14f893e..365650d 100644 --- a/internal/manager/servicegroup.go +++ b/internal/manager/servicegroup.go @@ -207,11 +207,14 @@ func (sg *ServiceGroup) RegisterService(newService *service.Service) { func (sg *ServiceGroup) RemoveService(id string) bool { sg.mu.Lock() - defer sg.mu.Unlock() + members := sg.Members + sg.mu.Unlock() - for idx, member := range sg.Members { + for idx, member := range members { if member.GetID() == id { + sg.mu.Lock() sg.Members = utils.RemoveIndexFromSlice(sg.Members, idx) + sg.mu.Unlock() sg.Update() break } diff --git a/internal/manager/servicegroup_test.go b/internal/manager/servicegroup_test.go index b746290..60d3643 100644 --- a/internal/manager/servicegroup_test.go +++ b/internal/manager/servicegroup_test.go @@ -16,6 +16,7 @@ type Test struct { } var activeConfig = model.GSLBConfig{ + ServiceID: "123", Fqdn: "test.example.com", Ip: "192.168.1.1", Port: "80", @@ -26,6 +27,7 @@ var activeConfig = model.GSLBConfig{ } var passiveConfig = model.GSLBConfig{ + ServiceID: "456", Fqdn: 
"test.example.com", Ip: "192.168.1.1", Port: "80", diff --git a/internal/service/service_test.go b/internal/service/service_test.go index 2baef55..d0314d4 100644 --- a/internal/service/service_test.go +++ b/internal/service/service_test.go @@ -2,7 +2,6 @@ package service import ( "errors" - "log" "testing" "time" @@ -132,9 +131,7 @@ func TestOnSuccess(t *testing.T) { for range svc0.FailureThreshold - 1 { svc0.OnFailure(errors.New("test error")) } - log.Printf("count: %v", svc0.failureCount) svc0.OnSuccess() - log.Printf("count: %v", svc0.failureCount) if !svc0.isHealthy { t.Errorf("Expected health: %v, but got: %v. After 2x OnFailure before OnSuccess()", true, svc0.IsHealthy()) @@ -143,9 +140,7 @@ func TestOnSuccess(t *testing.T) { for range svc0.FailureThreshold { svc0.OnFailure(errors.New("test error")) } - log.Printf("count: %v", svc0.failureCount) svc0.OnSuccess() - log.Printf("count: %v", svc0.failureCount) if svc0.isHealthy { t.Fatalf("Expected health: %v, but got: %v. After 3x OnFailure before OnSuccess()", false, svc0.IsHealthy()) @@ -241,9 +236,7 @@ func TestOnFailure(t *testing.T) { for range svc0.FailureThreshold - 1 { svc0.OnSuccess() } - log.Printf("count: %v", svc0.failureCount) svc0.OnFailure(errors.New("test")) - log.Printf("count: %v", svc0.failureCount) if svc0.isHealthy { t.Errorf("Expected health: %v, but got: %v. After 2x OnSuccess() before OnFailure()", false, svc0.IsHealthy()) @@ -252,9 +245,7 @@ func TestOnFailure(t *testing.T) { for range svc0.FailureThreshold { svc0.OnSuccess() } - log.Printf("count: %v", svc0.failureCount) svc0.OnFailure(errors.New("test")) - log.Printf("count: %v", svc0.failureCount) if !svc0.isHealthy { t.Fatalf("Expected health: %v, but got: %v. 
After 3x OnSuccess() before OnFailure()", true, svc0.IsHealthy()) @@ -272,6 +263,7 @@ func TestService_GetBaseInterval(t *testing.T) { { name: "baseinterval-5-priority-1", config: model.GSLBConfig{ + ServiceID: "123", Fqdn: "test.nhn.no", Ip: "127.0.0.1", Port: "80", @@ -286,6 +278,7 @@ func TestService_GetBaseInterval(t *testing.T) { { name: "baseinterval-5-priority-2", config: model.GSLBConfig{ + ServiceID: "123", Fqdn: "test.nhn.no", Ip: "127.0.0.1", Port: "80", @@ -300,6 +293,7 @@ func TestService_GetBaseInterval(t *testing.T) { { name: "baseinterval-5-priority-3", config: model.GSLBConfig{ + ServiceID: "123", Fqdn: "test.nhn.no", Ip: "127.0.0.1", Port: "80", @@ -314,6 +308,7 @@ func TestService_GetBaseInterval(t *testing.T) { { name: "baseinterval-5-priority-4", config: model.GSLBConfig{ + ServiceID: "123", Fqdn: "test.nhn.no", Ip: "127.0.0.1", Port: "80", diff --git a/pkg/dnsdist/client.go b/pkg/dnsdist/client.go index 080424d..5d49c4a 100644 --- a/pkg/dnsdist/client.go +++ b/pkg/dnsdist/client.go @@ -310,22 +310,20 @@ func incrementNonce(nonce *[NONCE_LEN]byte) { binary.BigEndian.PutUint32(nonce[:4], value) } -func (c *Client) AddDomainSpoof(domain string, ips []string) error { - // addAction(QNameRule('example.com'), SpoofAction({"192.168.1.0","192.168.1.2"}), {name="example.com"}) - cmd := fmt.Sprintf("addAction(QNameRule('%v'), SpoofAction({", domain) +func (c *Client) AddDomainSpoof(ruleName, domain, ip string) error { + // addAction(QNameRule('example.com'), SpoofAction({"192.168.1.0"}), {name="example.com:DC"}) + cmd := fmt.Sprintf("addAction(QNameRule('%v'), SpoofAction({'%s'}, {ttl=3600}), {name='%s'})", domain, ip, ruleName) + return Must(c.command(cmd)) +} - for _, ip := range ips { - cmd += fmt.Sprintf("'%v', ", ip) - } - idx := strings.LastIndex(cmd, ",") - if idx == -1 { - return fmt.Errorf("no trailing comma found in command: %s", cmd) - } - cmd = fmt.Sprintf("%v {name='%v'})", cmd[:idx]+"}),", domain) +func (c *Client) RmRuleWithName(ruleName 
string) error { + return Must(c.command(fmt.Sprintf("rmRule('%s')", ruleName))) +} - return Must(c.command(cmd)) +func (c *Client) RmRuleWithIndex(idx int) error { + return Must(c.command(fmt.Sprintf("rmRule('%d')", idx))) } -func (c *Client) RmDomainSpoof(domain string) error { - return Must(c.command(fmt.Sprintf("rmRule(%s)", domain))) +func (c *Client) ShowRules() (string, error) { + return c.command("showRules()") } diff --git a/pkg/dnsdist/client_test.go b/pkg/dnsdist/client_test.go index 0c632bc..9d6aec6 100644 --- a/pkg/dnsdist/client_test.go +++ b/pkg/dnsdist/client_test.go @@ -1,7 +1,6 @@ package dnsdist import ( - "log" "testing" ) @@ -16,6 +15,8 @@ func TestNewClient(t *testing.T) { } +/* +TODO: need to mock dnsdist - server in testing func TestCommand(t *testing.T) { client, err := NewClient( "M2YQKiPEDzeWHUFjejVOd+QHmMVmm2SuYG7vSXdaIkE=", @@ -43,8 +44,9 @@ func TestAddDomainSpoof(t *testing.T) { t.Errorf("could not create client: %v", err.Error()) } - err = client.AddDomainSpoof("test.nhn.no", []string{"10.10.0.1", "10.10.0.2"}) + err = client.AddDomainSpoof("test.nhn.no:test", "test.nhn.no", "127.0.0.1") if err != nil { t.Errorf("failed to create DomainSpoof") } } +*/ diff --git a/pkg/persistence/store/file/file.go b/pkg/persistence/store/file/file.go index 1d7406b..d6706fd 100644 --- a/pkg/persistence/store/file/file.go +++ b/pkg/persistence/store/file/file.go @@ -76,6 +76,10 @@ func (s *Store[T]) Load(key string) (T, error) { return zero, fmt.Errorf("unable to read storage: %w", err) } + if len(file) == 0 { + return zero, nil + } + err = json.Unmarshal(file, &s.cache) if err != nil { return zero, fmt.Errorf("unable to parse: %s: %s", key, err.Error()) @@ -94,6 +98,10 @@ func (s *Store[T]) LoadAll() ([]T, error) { return nil, fmt.Errorf("unable to read from storage: %s", err.Error()) } + if len(saved) == 0 { + return all, nil + } + store := make(map[string]T) err = json.Unmarshal(saved, &store) if err != nil { From 
b4d0a10f1aaa88332ae6d83af94f31062b304044 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Thu, 19 Feb 2026 11:46:57 +0100 Subject: [PATCH 07/24] set fixed golang version --- .github/workflows/build_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 135b884..a767b1d 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -16,7 +16,7 @@ jobs: - name: setup Go uses: actions/setup-go@v5 with: - go-version-file: go-mod + go-version: 1.25 cache: true cache-dependency-path: go.sum From fa2d0508c2e3fc4144ccfddf013dce73e92fc145 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Fri, 20 Feb 2026 12:20:28 +0100 Subject: [PATCH 08/24] started work for metrics collection --- internal/manager/healthcheck/healtheck.go | 47 ++++++++ internal/manager/healthcheck/metrics.go | 124 ++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 internal/manager/healthcheck/healtheck.go create mode 100644 internal/manager/healthcheck/metrics.go diff --git a/internal/manager/healthcheck/healtheck.go b/internal/manager/healthcheck/healtheck.go new file mode 100644 index 0000000..795cfe6 --- /dev/null +++ b/internal/manager/healthcheck/healtheck.go @@ -0,0 +1,47 @@ +package healthcheck + +import ( + "time" + + "github.com/vitistack/gslb-operator/internal/service" +) + +// the result of a singular health-check +type Result struct { + Success bool + timeTaken time.Duration +} + +type HealthCheckJob struct { + service *service.Service + metrics *HealthCheckMetricsCounter + lastCheck time.Time +} + +func NewJob(svc *service.Service) *HealthCheckJob { + return &HealthCheckJob{ + service: svc, + metrics: NewMetricsCounter(1_000), + } +} + +func (hj *HealthCheckJob) Execute() error { + hj.lastCheck = time.Now() + return hj.service.Execute() +} + +func (hj *HealthCheckJob) OnSuccess() { + hj.metrics.Record(&Result{ + Success: true, + timeTaken: 
time.Since(hj.lastCheck), + }) + hj.service.OnSuccess() +} + +func (hj *HealthCheckJob) OnFailure(err error) { + hj.metrics.Record(&Result{ + Success: false, + timeTaken: time.Since(hj.lastCheck), + }) + hj.service.OnFailure(err) +} diff --git a/internal/manager/healthcheck/metrics.go b/internal/manager/healthcheck/metrics.go new file mode 100644 index 0000000..421b0b3 --- /dev/null +++ b/internal/manager/healthcheck/metrics.go @@ -0,0 +1,124 @@ +// collects different metrics for health-checks since start of service +package healthcheck + +import ( + "sync" + "sync/atomic" + "time" +) + +// holds the timestamp of healthcheck time +// and the result of that healthcheck +type Recording struct { + timestamp time.Time + result *Result +} + +func NewRecording(res *Result) *Recording { + return &Recording{ + timestamp: time.Now().Add(-res.timeTaken), + result: res, + } +} + +type HealthCheckMetricsCounter struct { + totalChecks atomic.Int64 + totalSuccess atomic.Int64 + totalFailure atomic.Int64 + maxRecordings int + recordings []*Recording // health-check timestamps + mu sync.RWMutex +} + +func NewMetricsCounter(max int) *HealthCheckMetricsCounter { + return &HealthCheckMetricsCounter{ + totalChecks: atomic.Int64{}, + totalSuccess: atomic.Int64{}, + totalFailure: atomic.Int64{}, + maxRecordings: max, + recordings: make([]*Recording, 0, max), + mu: sync.RWMutex{}, + } +} + +// total checks in the last given time-frame +func (c *HealthCheckMetricsCounter) Last(dur time.Duration) int { + c.mu.Lock() + defer c.mu.Unlock() + count := 0 + + interval := time.Now().Add(-dur) + + for i := len(c.recordings) - 1; i >= 0; i-- { + if !c.recordings[i].timestamp.Before(interval) { + count++ + } else { + // since we start from the back + // the first recording that is BEFORE the interval + // means that every interval next after this one is also before the interval + // therefore no need to check them when we know they are not going to hit + break + } + } + + return count +} + +// 
total checks that have been successful in the last given time-frame
+func (c *HealthCheckMetricsCounter) SuccessLast(dur time.Duration) int {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	count := 0
+
+	interval := time.Now().Add(-dur)
+
+	for i := len(c.recordings) - 1; i >= 0; i-- {
+		if c.recordings[i].timestamp.Before(interval) {
+			break // same reason as Last(...) func
+		}
+		if c.recordings[i].result.Success {
+			count++
+		}
+	}
+
+	return count
+}
+
+// total checks that have failed in the last given time-frame
+func (c *HealthCheckMetricsCounter) FailuresLast(dur time.Duration) int {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	count := 0
+
+	interval := time.Now().Add(-dur)
+
+	for i := len(c.recordings) - 1; i >= 0; i-- {
+		if c.recordings[i].timestamp.Before(interval) {
+			break // same reason as Last(...) func
+		}
+		if !c.recordings[i].result.Success {
+			count++
+		}
+	}
+
+	return count
+}
+
+func (c *HealthCheckMetricsCounter) Record(result *Result) {
+	c.totalChecks.Add(1)
+
+	if result.Success {
+		c.totalSuccess.Add(1)
+	} else {
+		c.totalFailure.Add(1)
+	}
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if len(c.recordings) == c.maxRecordings {
+		c.recordings = append(c.recordings[1:], NewRecording(result))
+	} else {
+		c.recordings = append(c.recordings, NewRecording(result))
+	}
+}
From d48b53a069e4b51832a2d6f2f5dcbd70a738a6a9 Mon Sep 17 00:00:00 2001
From: Espen Wobbes
Date: Fri, 20 Feb 2026 13:01:55 +0100
Subject: [PATCH 09/24] added charts and updated release pipeline

---
 .github/workflows/release.yml                 | 131 +++++++++------
 charts/gslb-operator/.helmignore              |  23 +++
 charts/gslb-operator/Chart.yaml               |  27 ++++
 charts/gslb-operator/templates/NOTES.txt      |  29 ++++
 charts/gslb-operator/templates/_helpers.tpl   |  62 +++++++
 .../gslb-operator/templates/deployment.yaml   |  79 +++++++++
 charts/gslb-operator/templates/hpa.yaml       |  32 ++++
 charts/gslb-operator/templates/httproute.yaml |  38 +++++
 charts/gslb-operator/templates/rbac/role.yaml |   9 ++
.../templates/rbac/role_binding.yaml | 16 ++ .../templates/rbac/service_account.yaml | 15 ++ charts/gslb-operator/templates/service.yaml | 17 ++ charts/gslb-operator/values.yaml | 153 ++++++++++++++++++ 13 files changed, 581 insertions(+), 50 deletions(-) create mode 100644 charts/gslb-operator/.helmignore create mode 100644 charts/gslb-operator/Chart.yaml create mode 100644 charts/gslb-operator/templates/NOTES.txt create mode 100644 charts/gslb-operator/templates/_helpers.tpl create mode 100644 charts/gslb-operator/templates/deployment.yaml create mode 100644 charts/gslb-operator/templates/hpa.yaml create mode 100644 charts/gslb-operator/templates/httproute.yaml create mode 100644 charts/gslb-operator/templates/rbac/role.yaml create mode 100644 charts/gslb-operator/templates/rbac/role_binding.yaml create mode 100644 charts/gslb-operator/templates/rbac/service_account.yaml create mode 100644 charts/gslb-operator/templates/service.yaml create mode 100644 charts/gslb-operator/values.yaml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7b0ef70..fa0522c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,55 +1,86 @@ name: Release on: - push: - tags: - - "v*" + push: + tags: + - "v*" jobs: - create-oci-image: - runs-on: ubuntu-latest - steps: - - name: checkout code - uses: actions/checkout@v4 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v5 - with: - images: ghcr.io/vitistack/gslb-operator - tags: | - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=semver,pattern={{major}} - type=sha - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set release date - id: 
date - run: echo "DATE=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT - - - name: Build and push Docker image - uses: docker/build-push-action@v6 - with: - context: . - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - VERSION=${{ github.ref_name }} - DATE=${{ steps.date.outputs.DATE }} - platforms: linux/amd64,linux/arm64 - cache-from: type=gha - cache-to: type=gha,mode=max + create-oci-image: + runs-on: ubuntu-latest + steps: + - name: checkout code + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/vitistack/gslb-operator + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=sha + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set release date + id: date + run: echo "DATE=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+                  push: true
+                  tags: ${{ steps.meta.outputs.tags }}
+                  labels: ${{ steps.meta.outputs.labels }}
+                  build-args: |
+                      VERSION=${{ github.ref_name }}
+                      DATE=${{ steps.date.outputs.DATE }}
+                  platforms: linux/amd64,linux/arm64
+                  cache-from: type=gha
+                  cache-to: type=gha,mode=max
+    build-and-publish-helm-chart:
+        needs:
+            - create-oci-image
+        runs-on: ubuntu-latest
+        steps:
+            - name: checkout code
+              uses: actions/checkout@v4
+
+            - name: Install helm
+              uses: azure/setup-helm@v1
+              with:
+                  version: v3.16.4
+
+            - name: install-yq
+              env:
+                  VERSION: v4.44.5
+                  BINARY: yq_linux_amd64
+              run: |
+                  wget https://github.com/mikefarah/yq/releases/download/${VERSION}/${BINARY}.tar.gz -O - | tar xz && mv ${BINARY} yq && chmod +x yq
+
+            - name: build and push chart
+              env:
+                  VERSION: ${{ github.ref_name }}
+              run: |
+                  export HELM_VERSION=${VERSION#v*}
+                  ./yq e -i '.version = strenv(HELM_VERSION),.appVersion = strenv(VERSION)' charts/gslb-operator/Chart.yaml
+                  ./yq e -i '.image.tag = strenv(HELM_VERSION)' charts/gslb-operator/values.yaml
+                  ./yq e -i '.image.repository = "ncr.sky.nhn.no/ghcr/vitistack/gslb-operator"' charts/gslb-operator/values.yaml
+                  helm package charts/gslb-operator
+                  echo ${{ secrets.GITHUB_TOKEN }} | helm registry login -u ${{ github.actor }} ghcr.io --password-stdin
+                  helm push gslb-operator-${HELM_VERSION}.tgz oci://ghcr.io/vitistack/helm/
diff --git a/charts/gslb-operator/.helmignore b/charts/gslb-operator/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/charts/gslb-operator/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/gslb-operator/Chart.yaml b/charts/gslb-operator/Chart.yaml new file mode 100644 index 0000000..d659aea --- /dev/null +++ b/charts/gslb-operator/Chart.yaml @@ -0,0 +1,27 @@ +apiVersion: v2 +name: gslb-operator +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" + +# Chart icon - displayed in Helm repository UIs like ArtifactHub +icon: https://vitistack.io/images/viti1.svg diff --git a/charts/gslb-operator/templates/NOTES.txt b/charts/gslb-operator/templates/NOTES.txt new file mode 100644 index 0000000..f38c646 --- /dev/null +++ b/charts/gslb-operator/templates/NOTES.txt @@ -0,0 +1,29 @@ +1. 
Get the application URL by running these commands: +{{- if .Values.httpRoute.enabled }} +{{- if .Values.httpRoute.hostnames }} + export APP_HOSTNAME={{ .Values.httpRoute.hostnames | first }} +{{- else }} + export APP_HOSTNAME=$(kubectl get --namespace {{(first .Values.httpRoute.parentRefs).namespace | default .Release.Namespace }} gateway/{{ (first .Values.httpRoute.parentRefs).name }} -o jsonpath="{.spec.listeners[0].hostname}") + {{- end }} +{{- if and .Values.httpRoute.rules (first .Values.httpRoute.rules).matches (first (first .Values.httpRoute.rules).matches).path.value }} + echo "Visit http://$APP_HOSTNAME{{ (first (first .Values.httpRoute.rules).matches).path.value }} to use your application" + + NOTE: Your HTTPRoute depends on the listener configuration of your gateway and your HTTPRoute rules. + The rules can be set for path, method, header and query parameters. + You can check the gateway configuration with 'kubectl get --namespace {{(first .Values.httpRoute.parentRefs).namespace | default .Release.Namespace }} gateway/{{ (first .Values.httpRoute.parentRefs).name }} -o yaml' +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "gslb-operator.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "gslb-operator.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "gslb-operator.fullname" . 
}} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "gslb-operator.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/charts/gslb-operator/templates/_helpers.tpl b/charts/gslb-operator/templates/_helpers.tpl new file mode 100644 index 0000000..8a407f0 --- /dev/null +++ b/charts/gslb-operator/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the gslb-operator. +*/}} +{{- define "gslb-operator.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains gslb-operator name it will be used as a full name. +*/}} +{{- define "gslb-operator.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create gslb-operator name and version as used by the gslb-operator label. 
+*/}} +{{- define "gslb-operator.gslb-operator" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "gslb-operator.labels" -}} +helm.sh/gslb-operator: {{ include "gslb-operator.gslb-operator" . }} +{{ include "gslb-operator.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "gslb-operator.selectorLabels" -}} +app.kubernetes.io/name: {{ include "gslb-operator.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "gslb-operator.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "gslb-operator.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/gslb-operator/templates/deployment.yaml b/charts/gslb-operator/templates/deployment.yaml new file mode 100644 index 0000000..1d76a19 --- /dev/null +++ b/charts/gslb-operator/templates/deployment.yaml @@ -0,0 +1,79 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "gslb-operator.fullname" . }} + namespace: {{ .Values.namespace }} + labels: + {{- include "gslb-operator.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "gslb-operator.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "gslb-operator.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . 
| nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "gslb-operator.serviceAccountName" . }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: {{ .Chart.Name }} + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + {{- with .Values.livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.volumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.volumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/charts/gslb-operator/templates/hpa.yaml b/charts/gslb-operator/templates/hpa.yaml new file mode 100644 index 0000000..e261eb6 --- /dev/null +++ b/charts/gslb-operator/templates/hpa.yaml @@ -0,0 +1,32 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "gslb-operator.fullname" . }} + labels: + {{- include "gslb-operator.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "gslb-operator.fullname" . 
}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/charts/gslb-operator/templates/httproute.yaml b/charts/gslb-operator/templates/httproute.yaml new file mode 100644 index 0000000..afe2371 --- /dev/null +++ b/charts/gslb-operator/templates/httproute.yaml @@ -0,0 +1,38 @@ +{{- if .Values.httpRoute.enabled -}} +{{- $fullName := include "gslb-operator.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ $fullName }} + labels: + {{- include "gslb-operator.labels" . | nindent 4 }} + {{- with .Values.httpRoute.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + parentRefs: + {{- with .Values.httpRoute.parentRefs }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.httpRoute.hostnames }} + hostnames: + {{- toYaml . | nindent 4 }} + {{- end }} + rules: + {{- range .Values.httpRoute.rules }} + {{- with .matches }} + - matches: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .filters }} + filters: + {{- toYaml . 
| nindent 8 }} + {{- end }} + backendRefs: + - name: {{ $fullName }} + port: {{ $svcPort }} + weight: 1 + {{- end }} +{{- end }} diff --git a/charts/gslb-operator/templates/rbac/role.yaml b/charts/gslb-operator/templates/rbac/role.yaml new file mode 100644 index 0000000..e3c22c8 --- /dev/null +++ b/charts/gslb-operator/templates/rbac/role.yaml @@ -0,0 +1,9 @@ +{{- if .Values.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "gslb-operator.labels" . | nindent 4 }} + name: gslb-operator-role +rules: [] +{{- end }} \ No newline at end of file diff --git a/charts/gslb-operator/templates/rbac/role_binding.yaml b/charts/gslb-operator/templates/rbac/role_binding.yaml new file mode 100644 index 0000000..67c7b88 --- /dev/null +++ b/charts/gslb-operator/templates/rbac/role_binding.yaml @@ -0,0 +1,16 @@ +{{- if .Values.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "gslb-operator.labels" . | nindent 4 }} + name: gslb-operator-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: gslb-operator-role +subjects: +- kind: ServiceAccount + name: {{ .Values.serviceAccount.name }} + namespace: {{ .Values.namespace | default .Release.Namespace }} +{{- end -}} \ No newline at end of file diff --git a/charts/gslb-operator/templates/rbac/service_account.yaml b/charts/gslb-operator/templates/rbac/service_account.yaml new file mode 100644 index 0000000..aaac462 --- /dev/null +++ b/charts/gslb-operator/templates/rbac/service_account.yaml @@ -0,0 +1,15 @@ +{{- if .Values.rbac.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + {{- include "gslb-operator.labels" . 
| nindent 4 }} + {{- if and .Values.serviceAccount .Values.serviceAccount.annotations }} + annotations: + {{- range $key, $value := .Values.serviceAccount.annotations }} + {{ $key }}: {{ $value }} + {{- end }} + {{- end }} + name: {{ .Values.serviceAccount.name }} + namespace: {{ .Values.namespace | default .Release.Namespace }} +{{- end }} \ No newline at end of file diff --git a/charts/gslb-operator/templates/service.yaml b/charts/gslb-operator/templates/service.yaml new file mode 100644 index 0000000..36be3d1 --- /dev/null +++ b/charts/gslb-operator/templates/service.yaml @@ -0,0 +1,17 @@ +{{- if .Values.service.create }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "gslb-operator.fullname" . }} + labels: + {{- include "gslb-operator.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "gslb-operator.selectorLabels" . | nindent 4 }} +{{- end }} diff --git a/charts/gslb-operator/values.yaml b/charts/gslb-operator/values.yaml new file mode 100644 index 0000000..b45122b --- /dev/null +++ b/charts/gslb-operator/values.yaml @@ -0,0 +1,153 @@ +# Default values for chart. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# This will set the replicaset count more information can be found here: https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/ +replicaCount: 1 + +namespace: "gslb-operator" + +# This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/ +image: + repository: ghcr.io/vitistack/gslb-operator + # This sets the pull policy for images. + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. 
+ tag: "" + +# This is for the secrets for pulling an image from a private repository more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] +# This is to override the chart name. +nameOverride: "" +fullnameOverride: "" + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "sa-gslb-operator" + +rbac: + create: true + +# This is for setting Kubernetes Annotations to a Pod. +# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ +podAnnotations: {} +# This is for setting Kubernetes Labels to a Pod. 
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ +podLabels: {} + +podSecurityContext: + fsGroup: 2000 + +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + +# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ +service: + create: false + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 80 + +# -- Expose the service via gateway-api HTTPRoute +# Requires Gateway API resources and suitable controller installed within the cluster +# (see: https://gateway-api.sigs.k8s.io/guides/) +httpRoute: + # HTTPRoute enabled. + enabled: false + # HTTPRoute annotations. + annotations: {} + # Which Gateways this Route is attached to. + parentRefs: + - name: gateway + sectionName: http + # namespace: default + # Hostnames matching HTTP header. + hostnames: + - chart-example.local + # List of rules and filters applied. + rules: + - matches: + - path: + type: PathPrefix + value: /headers + # filters: + # - type: RequestHeaderModifier + # requestHeaderModifier: + # set: + # - name: My-Overwrite-Header + # value: this-is-the-only-value + # remove: + # - User-Agent + # - matches: + # - path: + # type: PathPrefix + # value: /echo + # headers: + # - name: version + # value: v2 + +resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. 
If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + #limits: + # cpu: 100m + # memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + +# This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ +#livenessProbe: +# httpGet: +# path: / +# port: http +#readinessProbe: +# httpGet: +# path: / +# port: http + +# This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/ +autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 1 + targetCPUUtilizationPercentage: 60 + targetMemoryUtilizationPercentage: 60 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition. 
+volumeMounts: [] +# - name: foo +# mountPath: "/etc/foo" +# readOnly: true + +nodeSelector: {} + +tolerations: [] + +affinity: {} From 1b1ab085d4da4b68d336f13ce5ef5d733b5a264a Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 24 Feb 2026 13:21:02 +0100 Subject: [PATCH 10/24] necessary changes for k8s deployment --- .gitignore | 1 + charts/gslb-operator/templates/configmap.yaml | 11 +++ .../gslb-operator/templates/credentials.yaml | 31 ++++++++ .../gslb-operator/templates/deployment.yaml | 3 + charts/gslb-operator/values.yaml | 27 +++++-- cmd/main.go | 38 +++++++++- internal/config/config.go | 20 ++++-- internal/dns/handler.go | 8 +-- internal/manager/manager.go | 20 +++--- internal/manager/scheduler/scheduler.go | 31 +++++++- internal/manager/servicegroup.go | 5 +- pkg/loaders/file_loader.go | 72 +++++++++++++++++-- 12 files changed, 233 insertions(+), 34 deletions(-) create mode 100644 charts/gslb-operator/templates/configmap.yaml create mode 100644 charts/gslb-operator/templates/credentials.yaml diff --git a/.gitignore b/.gitignore index b7792ea..d91ff60 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ go.work.sum # env file *.env +secrets store.json # Editor/IDE diff --git a/charts/gslb-operator/templates/configmap.yaml b/charts/gslb-operator/templates/configmap.yaml new file mode 100644 index 0000000..e052afd --- /dev/null +++ b/charts/gslb-operator/templates/configmap.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: gslb-operator-config + namespace: {{ .Values.namespace }} +data: + SRV_ENV: {{ .Values.settings.env }} + SRV_LUA_SANDBOX: {{ .Values.settings.sandbox }} + API_PORT: {{ .Values.settings.port }} + GSLB_POLL_INTERVAL: {{ .Values.settings.poll_interval }} + GSLB_UPDATER_HOST: {{ .Values.settings.gslb_updater }} diff --git a/charts/gslb-operator/templates/credentials.yaml b/charts/gslb-operator/templates/credentials.yaml new file mode 100644 index 0000000..1b14ecb --- /dev/null +++ 
b/charts/gslb-operator/templates/credentials.yaml @@ -0,0 +1,31 @@ +{{- if .Values.vault.enable }} +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: gslb-operator-secrets + namespace: {{ .Values.namespace }} +spec: + refreshInterval: 1h + secretStoreRef: + name: lb-openbao + kind: ClusterSecretStore + target: + name: gslb-operator-secrets + data: + - secretKey: JWT_SECRET + remoteRef: + key: /gslb-operator + property: jwt-secret + - secretKey: JWT_USER + remoteRef: + key: /gslb-operator + property: jwt-user + - secretKey: GSLB_ZONE + remoteRef: + key: /gslb-operator + property: gslb-zone + - secretKey: GSLB_NAMESERVER + remoteRef: + key: /gslb-operator + property: gslb-nameserver +{{- end }} diff --git a/charts/gslb-operator/templates/deployment.yaml b/charts/gslb-operator/templates/deployment.yaml index 1d76a19..85da3b8 100644 --- a/charts/gslb-operator/templates/deployment.yaml +++ b/charts/gslb-operator/templates/deployment.yaml @@ -41,6 +41,9 @@ spec: {{- end }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + envFrom: + - configMapRef: + name: gslb-operator-config ports: - name: http containerPort: {{ .Values.service.port }} diff --git a/charts/gslb-operator/values.yaml b/charts/gslb-operator/values.yaml index b45122b..37811cf 100644 --- a/charts/gslb-operator/values.yaml +++ b/charts/gslb-operator/values.yaml @@ -127,21 +127,24 @@ resources: # This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/ autoscaling: - enabled: true + enabled: false minReplicas: 1 maxReplicas: 1 targetCPUUtilizationPercentage: 60 targetMemoryUtilizationPercentage: 60 # Additional volumes on the output Deployment definition. 
-volumes: [] -# - name: foo -# secret: -# secretName: mysecret -# optional: false +volumes: + - name: secrets + secret: + secretName: gslb-operator-secrets + optional: false # Additional volumeMounts on the output Deployment definition. -volumeMounts: [] +volumeMounts: + - name: secrets + mountPath: "/app/secrets" + readOnly: true # - name: foo # mountPath: "/etc/foo" # readOnly: true @@ -151,3 +154,13 @@ nodeSelector: {} tolerations: [] affinity: {} + +settings: + env: prod + sandbox: sandbox.lua + port: :3000 + poll_interval: 1m + gslb_updater: 127.0.0.1:9000 + +vault: + enable: true diff --git a/cmd/main.go b/cmd/main.go index a387544..0297d3c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -13,11 +13,13 @@ import ( "github.com/vitistack/gslb-operator/internal/api/handlers/failover" "github.com/vitistack/gslb-operator/internal/api/handlers/spoofs" "github.com/vitistack/gslb-operator/internal/api/routes" + "github.com/vitistack/gslb-operator/internal/checks" "github.com/vitistack/gslb-operator/internal/config" "github.com/vitistack/gslb-operator/internal/dns" "github.com/vitistack/gslb-operator/internal/manager" "github.com/vitistack/gslb-operator/internal/model" "github.com/vitistack/gslb-operator/internal/repositories/service" + "github.com/vitistack/gslb-operator/internal/utils/timesutil" "github.com/vitistack/gslb-operator/pkg/auth" "github.com/vitistack/gslb-operator/pkg/auth/jwt" "github.com/vitistack/gslb-operator/pkg/bslog" @@ -55,6 +57,7 @@ func main() { manager.WithMinRunningWorkers(100), manager.WithNonBlockingBufferSize(110), manager.WithServiceRepository(svcRepo), + manager.WithDryRun(true), ) updater, err := dns.NewUpdater() @@ -70,6 +73,14 @@ func main() { background := context.Background() dnsHandler.Start(context.WithCancel(background)) + configs := getRandomGSLBConfig() + for _, cfg := range configs { + _, err := mgr.RegisterService(cfg) + if err != nil { + bslog.Fatal("could not create service", slog.String("reason", err.Error())) + } + } + api 
:= http.NewServeMux() // routes handlers @@ -140,7 +151,7 @@ func main() { bslog.Info("gracefully shutting down...") } - shutdown, cancel := context.WithTimeout(background, time.Second*5) + shutdown, cancel := context.WithTimeout(background, time.Second*20) defer cancel() dnsHandler.Stop(shutdown) @@ -148,3 +159,28 @@ func main() { panic("error shutting down server: " + err.Error()) } } + +func getRandomGSLBConfig() []model.GSLBConfig { + configs := make([]model.GSLBConfig, 0, 500) + + cfg := model.GSLBConfig{ + Fqdn: "test.example.com", + Ip: "10.10.0.1", + Port: "80", + Datacenter: "DC1", + Interval: timesutil.FromDuration(time.Second * 5), + Priority: 1, + FailureThreshold: 3, + CheckType: checks.TCP_FULL, + } + + for idx := range cap(configs) { + + cfg.ServiceID = fmt.Sprintf("%d", idx) + cfg.MemberOf = fmt.Sprintf("%s.%s", cfg.ServiceID, cfg.Fqdn) + + configs = append(configs, cfg) + } + + return configs +} diff --git a/internal/config/config.go b/internal/config/config.go index c3ba99d..4dce0fb 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -73,7 +73,6 @@ func (c *Config) JWT() *JWT { // Server configuration type Server struct { ENV string `env:"SRV_ENV" flag:"env"` - DC string `env:"SRV_DATACENTER" flag:"datacenter"` LUA_SANDBOX string `env:"SRV_LUA_SANDBOX" flag:"lua-sandbox"` } @@ -81,10 +80,6 @@ func (s *Server) Env() string { return s.ENV } -func (s *Server) Datacenter() string { - return s.DC -} - func (s *Server) LuaSandbox() string { return s.LUA_SANDBOX } @@ -141,9 +136,18 @@ func (jwt *JWT) User() string { } func newConfig() (*Config, error) { + fileLoader, err := loaders.NewFileLoader( + ".env", + "./secrets", + ) + + if err != nil { + return nil, err + } + loader := loaders.NewChainLoader( loaders.NewEnvloader(), - loaders.NewFileLoader(".env"), + fileLoader, loaders.NewFlagLoader(), ) @@ -154,7 +158,9 @@ func newConfig() (*Config, error) { apiCfg := API{ PORT: ":8080", } - gslbCfg := GSLB{} + gslbCfg := GSLB{ + 
POLLINTERVAL: "1m", + } jwtCfg := JWT{} configs := []any{ diff --git a/internal/dns/handler.go b/internal/dns/handler.go index 6e8e4e8..e23a3ba 100644 --- a/internal/dns/handler.go +++ b/internal/dns/handler.go @@ -63,10 +63,10 @@ func (h *Handler) Start(ctx context.Context, cancel func()) { func (h *Handler) Stop(ctx context.Context) { done := make(chan struct{}) go func() { - h.cancel() // cancel zone-updates - h.wg.Wait() - h.svcManager.Stop() - close(done) + h.cancel() // cancel zone-updates + h.wg.Wait() + h.svcManager.Stop() + close(done) }() select { diff --git a/internal/manager/manager.go b/internal/manager/manager.go index abd5e7f..ea89815 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -69,19 +69,19 @@ func (sm *ServicesManager) Start() { } func (sm *ServicesManager) Stop() { - sm.pool.Stop() sm.stop.Do(func() { + for _, scheduler := range sm.schedulers { + scheduler.Stop() + } + bslog.Debug("waiting for schedulers to stop") + sm.wg.Wait() + + bslog.Debug("schedulers stopped - closing pool") + sm.pool.Stop() err := sm.OnShutdown() if err != nil { bslog.Error("error while performing shutdown tasks", slog.String("error", err.Error())) } - - for interval, scheduler := range sm.schedulers { - scheduler.Stop() - bslog.Debug("scheduler closed", slog.String("interval", interval.String())) - } - - sm.wg.Wait() bslog.Debug("service manager closed") }) } @@ -89,6 +89,7 @@ func (sm *ServicesManager) Stop() { func (sm *ServicesManager) OnShutdown() error { sm.mutex.Lock() defer sm.mutex.Unlock() + bslog.Debug("executing manager.OnShutdown()") for memberOf, group := range sm.serviceGroups { active := group.GetActive() @@ -406,6 +407,9 @@ func (sm *ServicesManager) handlePromotion(event *PromotionEvent) { } bslog.Info("new active service", slog.Any("service", event.NewActive)) sm.moveServiceToInterval(event.NewActive, baseInterval) + if sm.DNSUpdate == nil { + bslog.Fatal("DNSUpdate is nil!!!!") + } sm.DNSUpdate(event.NewActive, true) 
return } diff --git a/internal/manager/scheduler/scheduler.go b/internal/manager/scheduler/scheduler.go index 8119454..545af82 100644 --- a/internal/manager/scheduler/scheduler.go +++ b/internal/manager/scheduler/scheduler.go @@ -2,11 +2,13 @@ package scheduler import ( "container/heap" + "log/slog" "math/rand/v2" "sync" "time" "github.com/vitistack/gslb-operator/internal/service" + "github.com/vitistack/gslb-operator/pkg/bslog" ) const OFFSETS_PER_SECOND = 2 @@ -39,7 +41,7 @@ type Scheduler struct { // random jitter to spread out scheduled service on interval and sub-tick jitterRange time.Duration - stop chan struct{} + stop chan struct{} // signal stop wg *sync.WaitGroup mu sync.Mutex @@ -143,27 +145,52 @@ func (s *Scheduler) loop() { s.mu.Lock() s.isRunning = false s.mu.Unlock() + bslog.Debug("scheduler closed", slog.String("interval", s.interval.String())) }() for { + select { + case <-s.stop: // check stop + bslog.Debug("got stop, exiting scheduler...") + return + default: + } + s.mu.Lock() if s.heap.Len() == 0 { // no need to infinitly run on an empty queue s.mu.Unlock() - break + return } next := s.heap.Peek() s.mu.Unlock() if next.nextCheckTime.Before(time.Now()) { // check time already past, do action immediately and reschedule s.OnTick(next.service) + + select { + case <-s.stop: // check stop + bslog.Debug("got stop, exiting scheduler...") + return + default: + } + s.reSchedule() } else { timeUntil := time.Until(next.nextCheckTime) select { case <-s.stop: + bslog.Debug("got stop, exiting scheduler...") return case <-time.After(timeUntil): s.OnTick(next.service) + + select { + case <-s.stop: // check stop + bslog.Debug("got stop, exiting scheduler...") + return + default: + } + s.reSchedule() } } diff --git a/internal/manager/servicegroup.go b/internal/manager/servicegroup.go index 365650d..8e3c274 100644 --- a/internal/manager/servicegroup.go +++ b/internal/manager/servicegroup.go @@ -132,6 +132,7 @@ func (sg *ServiceGroup) 
OnServiceHealthChange(changedService *service.Service, h sg.lastActive = sg.active sg.active = changedService + sg.mu.Unlock() sg.OnPromotion(event) } @@ -139,6 +140,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h if healthy { // If prioritized DC service becomes healthy, it must become active (single DNS record). if changedService.Datacenter == sg.prioritizedDatacenter && changedService != sg.active { + sg.mu.Unlock() sg.OnPromotion(&PromotionEvent{ Service: changedService.Fqdn, NewActive: changedService, @@ -149,6 +151,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h } // If there is no active or the current active is unhealthy, promote this healthy service. if sg.active == nil || !sg.active.IsHealthy() { + sg.mu.Unlock() sg.OnPromotion(&PromotionEvent{ Service: changedService.Fqdn, NewActive: changedService, @@ -183,9 +186,9 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h }) sg.lastActive = sg.active sg.active = nil + return } } - sg.mu.Unlock() } // This does not take in to account if the registered service has the highest priority diff --git a/pkg/loaders/file_loader.go b/pkg/loaders/file_loader.go index 770cf27..1fe6a1a 100644 --- a/pkg/loaders/file_loader.go +++ b/pkg/loaders/file_loader.go @@ -3,7 +3,9 @@ package loaders import ( "encoding/json" "fmt" + "io/fs" "os" + "path/filepath" "reflect" "strings" @@ -14,10 +16,36 @@ type FileLoader struct { fileNames []string } -func NewFileLoader(fileNames ...string) *FileLoader { - return &FileLoader{ - fileNames: fileNames, +func NewFileLoader(fileNames ...string) (*FileLoader, error) { + loader := &FileLoader{ + fileNames: make([]string, 0, len(fileNames)), } + for _, file := range fileNames { + info, err := os.Stat(file) + if err == nil { // silently drop files that dont exist + if info.IsDir() { + err := filepath.Walk(file, func(path string, info fs.FileInfo, err error) error { + if err != nil { + 
return err + } + + if info.IsDir() { + return nil + } + + loader.fileNames = append(loader.fileNames, path) + return nil + }) + if err != nil { + return nil, fmt.Errorf("could not list files in directory: %w", err) + } + } else { + loader.fileNames = append(loader.fileNames, file) + } + } + } + + return loader, nil } func (f *FileLoader) Load(dest any) error { @@ -31,7 +59,7 @@ func (f *FileLoader) Load(dest any) error { err = f.loadJSON(dest, file) default: - err = f.loadDotEnv(dest, file) + err = f.loadPlainText(dest, file) } if err != nil { return fmt.Errorf("could not load file: %s: %w", file, err) @@ -89,3 +117,39 @@ func (f *FileLoader) loadJSON(dest any, file string) error { return nil } + +func (f *FileLoader) loadPlainText(dest any, file string) error { + val := reflect.ValueOf(dest).Elem() + typ := val.Type() + + if typ.Kind() != reflect.Struct { + return fmt.Errorf("unable to load config file: %s: destination must be a struct pointer", file) + } + rawData, err := os.ReadFile(file) + if err != nil { + return fmt.Errorf("could not read file: %s: %w", file, err) + } + data := string(rawData) + + for i := range val.NumField() { + field := val.Field(i) + fieldTyp := typ.Field(i) + + if !field.CanSet() { + continue + } + + tag, ok := fieldTyp.Tag.Lookup("env") + if !ok { + continue + } + + if strings.Contains(file, tag) { // file name must contain the struct tag + if err := setEnvironmentVariable(field, data); err != nil { + return fmt.Errorf("unable to set struct value: %w", err) + } + } + } + + return nil +} From 9846477d22bfaf4dcfb8ce8d56c3ba295531d43f Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 24 Feb 2026 13:44:32 +0100 Subject: [PATCH 11/24] fix: permissions --- .github/workflows/release.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fa0522c..cea85d1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,6 +5,10 @@ on: tags: - "v*" 
+permissions: + contents: read + packages: write + jobs: create-oci-image: runs-on: ubuntu-latest From 521f15fcbac464556b14ef13e4533e0eeefe0262 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 24 Feb 2026 14:18:15 +0100 Subject: [PATCH 12/24] fix: helmchart yq stuff --- .github/workflows/release.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cea85d1..1974589 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -82,9 +82,8 @@ jobs: VERSION: ${{ github.ref_name }} run: | export HELM_VERSION=${VERSION#v*} - ./yq e -i '.version = strenv(HELM_VERSION),.appVersion = strenv(VERSION)' chart/ipam-operator/Chart.yaml - ./yq e -i '.global.controller.tag = strenv(HELM_VERSION),.global.controller.repository = "ncr.sky.nhn.no/ghcr/vitistack/ipam-operator"' chart/ipam-operator/values.prod.yaml - ./yq e -i '.global.controller.tag = strenv(HELM_VERSION),.global.controller.repository = "ncr.sky.nhn.no/ghcr/vitistack/ipam-operator"' chart/ipam-operator/values.test.yaml + ./yq e -i '.version = strenv(HELM_VERSION),.appVersion = strenv(VERSION)' charts/gslb-operator/Chart.yaml + ./yq e -i '.global.controller.tag = strenv(HELM_VERSION),.global.controller.repository = "ncr.sky.nhn.no/ghcr/vitistack/gslb-operator"' charts/gslb-operator/values.yaml helm package chart/ipam-operator echo ${{ secrets.GITHUB_TOKEN }} | helm registry login -u ${{ github.actor }} ghcr.io --password-stdin helm push ipam-operator-${HELM_VERSION}.tgz oci://ghcr.io/vitistack/helm/ From 6479eb551757238b8ce140857901dc74a0beecec Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 24 Feb 2026 14:27:07 +0100 Subject: [PATCH 13/24] fix: helm commands again --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1974589..7952ce3 100644 --- a/.github/workflows/release.yml +++ 
b/.github/workflows/release.yml @@ -84,6 +84,6 @@ jobs: export HELM_VERSION=${VERSION#v*} ./yq e -i '.version = strenv(HELM_VERSION),.appVersion = strenv(VERSION)' charts/gslb-operator/Chart.yaml ./yq e -i '.global.controller.tag = strenv(HELM_VERSION),.global.controller.repository = "ncr.sky.nhn.no/ghcr/vitistack/gslb-operator"' charts/gslb-operator/values.yaml - helm package chart/ipam-operator + helm package charts/gslb-operator echo ${{ secrets.GITHUB_TOKEN }} | helm registry login -u ${{ github.actor }} ghcr.io --password-stdin helm push ipam-operator-${HELM_VERSION}.tgz oci://ghcr.io/vitistack/helm/ From 0dcf2e70f5db371bbf4761d8887f361db23232da Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 24 Feb 2026 14:32:24 +0100 Subject: [PATCH 14/24] fix: HELM --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7952ce3..e8b5e68 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -86,4 +86,4 @@ jobs: ./yq e -i '.global.controller.tag = strenv(HELM_VERSION),.global.controller.repository = "ncr.sky.nhn.no/ghcr/vitistack/gslb-operator"' charts/gslb-operator/values.yaml helm package charts/gslb-operator echo ${{ secrets.GITHUB_TOKEN }} | helm registry login -u ${{ github.actor }} ghcr.io --password-stdin - helm push ipam-operator-${HELM_VERSION}.tgz oci://ghcr.io/vitistack/helm/ + helm push gslb-operator-${HELM_VERSION}.tgz oci://ghcr.io/vitistack/helm/ From c3b37a7c860961584519bab89bc9b89adafaf3e9 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 24 Feb 2026 15:13:34 +0100 Subject: [PATCH 15/24] HELM --- charts/gslb-operator/templates/deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/gslb-operator/templates/deployment.yaml b/charts/gslb-operator/templates/deployment.yaml index 85da3b8..c1290b2 100644 --- a/charts/gslb-operator/templates/deployment.yaml +++ 
b/charts/gslb-operator/templates/deployment.yaml @@ -43,7 +43,7 @@ spec: imagePullPolicy: {{ .Values.image.pullPolicy }} envFrom: - configMapRef: - name: gslb-operator-config + name: gslb-operator-config ports: - name: http containerPort: {{ .Values.service.port }} From 57eb79c6dfebf6fe57b6ed9dd3a277a8b0e2e8cf Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Wed, 25 Feb 2026 08:13:38 +0100 Subject: [PATCH 16/24] fix: repository url for deployment image --- charts/gslb-operator/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/gslb-operator/values.yaml b/charts/gslb-operator/values.yaml index 37811cf..14f7609 100644 --- a/charts/gslb-operator/values.yaml +++ b/charts/gslb-operator/values.yaml @@ -9,7 +9,7 @@ namespace: "gslb-operator" # This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/ image: - repository: ghcr.io/vitistack/gslb-operator + repository: ncr.sky.nhn.no/vitistack/gslb-operator # This sets the pull policy for images. pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. 
From dd8f1f4ea98fda5581b46b907cd520ca3bf8f72d Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Wed, 25 Feb 2026 09:04:36 +0100 Subject: [PATCH 17/24] fix: checking if file is directory before continuing in loading plain text --- pkg/loaders/file_loader.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/loaders/file_loader.go b/pkg/loaders/file_loader.go index 1fe6a1a..ce632f0 100644 --- a/pkg/loaders/file_loader.go +++ b/pkg/loaders/file_loader.go @@ -119,6 +119,15 @@ func (f *FileLoader) loadJSON(dest any, file string) error { } func (f *FileLoader) loadPlainText(dest any, file string) error { + info, err := os.Stat(file) + if err != nil { + return fmt.Errorf("unable to load file: %s: %w", file, err) + } + + if info.IsDir() { // skip directories + return nil + } + val := reflect.ValueOf(dest).Elem() typ := val.Type() From b1b9e0c7faf7f522173b8878cbae81cd6c8d259b Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Wed, 25 Feb 2026 09:47:27 +0100 Subject: [PATCH 18/24] fix: Dockerfile permissions on sandbox, and helm (again) --- .github/workflows/release.yml | 2 +- Dockerfile | 6 +++++- charts/gslb-operator/values.yaml | 10 ++++++---- cmd/main.go | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e8b5e68..9cf45b9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -83,7 +83,7 @@ jobs: run: | export HELM_VERSION=${VERSION#v*} ./yq e -i '.version = strenv(HELM_VERSION),.appVersion = strenv(VERSION)' charts/gslb-operator/Chart.yaml - ./yq e -i '.global.controller.tag = strenv(HELM_VERSION),.global.controller.repository = "ncr.sky.nhn.no/ghcr/vitistack/gslb-operator"' charts/gslb-operator/values.yaml + ./yq e -i '.image.tag = strenv(HELM_VERSION),.image.repository = "ncr.sky.nhn.no/ghcr/vitistack/gslb-operator"' charts/gslb-operator/values.yaml helm package charts/gslb-operator echo ${{ secrets.GITHUB_TOKEN }} | helm registry login -u ${{ 
github.actor }} ghcr.io --password-stdin helm push gslb-operator-${HELM_VERSION}.tgz oci://ghcr.io/vitistack/helm/ diff --git a/Dockerfile b/Dockerfile index 42e158d..efb0f19 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,11 +19,15 @@ FROM alpine:3.23 WORKDIR /app RUN addgroup -S gslb-group && adduser -S gslb-operator -G gslb-group -RUN chown -R gslb-operator:gslb-group /app COPY --from=build /app/gslb-operator /app/gslb-operator COPY sandbox.lua /app +# change ownership of directory +RUN chown -R gslb-operator:gslb-group /app + +# sandbox is read-only +RUN chmod 440 sandbox.lua USER gslb-operator CMD [ "./gslb-operator" ] diff --git a/charts/gslb-operator/values.yaml b/charts/gslb-operator/values.yaml index 14f7609..d4d707d 100644 --- a/charts/gslb-operator/values.yaml +++ b/charts/gslb-operator/values.yaml @@ -51,7 +51,7 @@ securityContext: capabilities: drop: - ALL - readOnlyRootFilesystem: true + readOnlyRootFilesystem: false runAsNonRoot: true runAsUser: 1000 seccompProfile: @@ -139,15 +139,17 @@ volumes: secret: secretName: gslb-operator-secrets optional: false + - name: data + emptyDir: {} # Additional volumeMounts on the output Deployment definition. 
volumeMounts: - name: secrets mountPath: "/app/secrets" readOnly: true -# - name: foo -# mountPath: "/etc/foo" -# readOnly: true + - name: data + mountPath: "/app/data" + readOnly: false nodeSelector: {} diff --git a/cmd/main.go b/cmd/main.go index 0297d3c..05507b0 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -45,7 +45,7 @@ func main() { bslog.Fatal("could not load lua configuration", slog.Any("reason", err)) } - serviceFileStore, err := file.NewStore[model.GSLBServiceGroup]("store.json") + serviceFileStore, err := file.NewStore[model.GSLBServiceGroup]("./data/store.json") if err != nil { bslog.Fatal("could not create persistent storage", slog.String("reason", err.Error())) } From d7382b3ac1944954c3d81251777ac7e51c36081a Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Wed, 25 Feb 2026 10:29:39 +0100 Subject: [PATCH 19/24] fix: remove runasnonroot and runasuser handled in the Dockerfile instead --- charts/gslb-operator/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/gslb-operator/values.yaml b/charts/gslb-operator/values.yaml index d4d707d..ef4c32a 100644 --- a/charts/gslb-operator/values.yaml +++ b/charts/gslb-operator/values.yaml @@ -52,8 +52,8 @@ securityContext: drop: - ALL readOnlyRootFilesystem: false - runAsNonRoot: true - runAsUser: 1000 + #runAsNonRoot: true + #runAsUser: 1000 seccompProfile: type: RuntimeDefault From 22a9d212a6a8d364ae3ae9efa34ff12027bd94c6 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Thu, 26 Feb 2026 12:07:45 +0100 Subject: [PATCH 20/24] feat: metrics collection for prometheus --- cmd/main.go | 4 + docker-compose.yaml | 51 ++++++++ go.mod | 10 ++ go.sum | 50 ++++++-- internal/api/routes/const.go | 3 + internal/manager/healthcheck/healtheck.go | 35 +++--- internal/manager/healthcheck/metrics.go | 137 +++------------------- internal/manager/manager.go | 39 ++++-- internal/manager/metrics.go | 26 ++++ internal/manager/servicegroup.go | 39 +++--- internal/manager/servicegroup_test.go | 6 +- 
internal/service/service.go | 2 +- pkg/pool/pool.go | 13 ++ 13 files changed, 237 insertions(+), 178 deletions(-) create mode 100644 internal/manager/metrics.go diff --git a/cmd/main.go b/cmd/main.go index a387544..61c5ad6 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -10,6 +10,7 @@ import ( "syscall" "time" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/vitistack/gslb-operator/internal/api/handlers/failover" "github.com/vitistack/gslb-operator/internal/api/handlers/spoofs" "github.com/vitistack/gslb-operator/internal/api/routes" @@ -117,6 +118,9 @@ func main() { middleware.WithIncomingRequestLogging(slog.Default()), )(spoofsApiService.DeleteOverride)) + // metrics + api.Handle(routes.METRICS, promhttp.Handler()) + server := http.Server{ Addr: cfg.API().Port(), Handler: api, diff --git a/docker-compose.yaml b/docker-compose.yaml index 764edf1..fbeede4 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -7,6 +7,9 @@ services: # - ./policies:/policies opa: image: openpolicyagent/opa:0.60.0 + profiles: + - "opa" + - "all" command: - "run" - "--server" @@ -25,3 +28,51 @@ services: interval: 10s timeout: 5s retries: 3 + + prometheus: + image: prom/prometheus:latest + profiles: + - "monitoring" + - "all" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + ports: + - "9090:9090" + volumes: + - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"] + interval: 10s + timeout: 5s + retries: 3 + + grafana: + image: grafana/grafana:latest + profiles: + - "monitoring" + - "all" + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - 
grafana-data:/var/lib/grafana + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + depends_on: + - prometheus + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"] + interval: 10s + timeout: 5s + retries: 3 + +volumes: + prometheus-data: + grafana-data: \ No newline at end of file diff --git a/go.mod b/go.mod index 560d08a..36a1c4b 100644 --- a/go.mod +++ b/go.mod @@ -7,12 +7,22 @@ require ( github.com/golang-jwt/jwt/v5 v5.3.1 github.com/google/uuid v1.6.0 github.com/joho/godotenv v1.5.1 + github.com/prometheus/client_golang v1.23.2 github.com/tevino/tcp-shaker v0.0.0-20260210162928-fb888f26451b github.com/yuin/gopher-lua v1.1.1 golang.org/x/crypto v0.48.0 ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/kr/text v0.2.0 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.67.5 // indirect + github.com/prometheus/procfs v0.19.2 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect golang.org/x/net v0.50.0 // indirect golang.org/x/sys v0.41.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect ) diff --git a/go.sum b/go.sum index 6e25b55..3809fd9 100644 --- a/go.sum +++ b/go.sum @@ -1,28 +1,58 @@ -codeberg.org/miekg/dns v0.5.21 h1:O+Ibq9IJuOeMoBnNmYdQmKJ7J9zgEsUqcbBhjsSrzIc= -codeberg.org/miekg/dns v0.5.21/go.mod h1:Q10KolpjjNhl9x14KdKA3s+7Xynb8Zqvjj9jWyzrYRA= codeberg.org/miekg/dns v0.6.48 h1:+RZiJMKPq5BYjePB7AfTv7O+qf/3Kjsz9C4WmOUHdoA= codeberg.org/miekg/dns v0.6.48/go.mod h1:fIxAzBMDPnXWSw0fp8+pfZMRiAqYY4+HHYLzUo/S6Dg= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/tevino/tcp-shaker v0.0.0-20251020080735-c4094cd6c927 h1:BdtSwzS6fNIAC3Ylj3x/ak6PD4EV885gGhWR7eIplEI= -github.com/tevino/tcp-shaker v0.0.0-20251020080735-c4094cd6c927/go.mod h1:S0VUAF1puvgOrlSQqCrJiz2t7yn2gPKYSpGu4+w8eg0= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
+github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= +github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/tevino/tcp-shaker v0.0.0-20260210162928-fb888f26451b h1:vmeHwA9U5lODKqvdZQxKqy+i1Q2yMwShjxytoszeWmw= github.com/tevino/tcp-shaker v0.0.0-20260210162928-fb888f26451b/go.mod h1:bNnAwCfoEQXR47eBqFYS9fD6qTcY3t5ZUUgBZskRdcY= github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= -golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= -golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= golang.org/x/crypto 
v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= -golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= -golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= -golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= -golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/api/routes/const.go b/internal/api/routes/const.go index e58bf43..60f2412 100644 --- a/internal/api/routes/const.go +++ b/internal/api/routes/const.go @@ -27,6 +27,9 @@ const ( AUTH = ROOT + "auth" AUTH_LOGIN = AUTH + "/login" POST_AUTH_LOGIN = http.MethodPost + " " + AUTH_LOGIN + + METRICS = ROOT + "metrics" + GET_METRICS = http.MethodGet + " " + METRICS ) const ( diff --git a/internal/manager/healthcheck/healtheck.go b/internal/manager/healthcheck/healtheck.go index 795cfe6..e592d75 100644 --- a/internal/manager/healthcheck/healtheck.go +++ 
b/internal/manager/healthcheck/healtheck.go @@ -1,47 +1,40 @@ package healthcheck import ( + "log/slog" "time" "github.com/vitistack/gslb-operator/internal/service" + "github.com/vitistack/gslb-operator/pkg/bslog" ) -// the result of a singular health-check -type Result struct { - Success bool - timeTaken time.Duration -} - type HealthCheckJob struct { - service *service.Service - metrics *HealthCheckMetricsCounter + Service *service.Service lastCheck time.Time } func NewJob(svc *service.Service) *HealthCheckJob { return &HealthCheckJob{ - service: svc, - metrics: NewMetricsCounter(1_000), + Service: svc, } } func (hj *HealthCheckJob) Execute() error { hj.lastCheck = time.Now() - return hj.service.Execute() + err := hj.Service.Execute() + + bslog.Debug("check complete", slog.Float64("duration_ms", float64(time.Since(hj.lastCheck).Milliseconds()))) + return err } func (hj *HealthCheckJob) OnSuccess() { - hj.metrics.Record(&Result{ - Success: true, - timeTaken: time.Since(hj.lastCheck), - }) - hj.service.OnSuccess() + healthChecksTotal.WithLabelValues(hj.Service.String(), "success").Inc() + healthCheckDuration.WithLabelValues(hj.Service.String()).Observe(float64(time.Since(hj.lastCheck).Milliseconds())) + hj.Service.OnSuccess() } func (hj *HealthCheckJob) OnFailure(err error) { - hj.metrics.Record(&Result{ - Success: false, - timeTaken: time.Since(hj.lastCheck), - }) - hj.service.OnFailure(err) + healthChecksTotal.WithLabelValues(hj.Service.String(), "failure").Inc() + healthCheckDuration.WithLabelValues(hj.Service.String()).Observe(float64(time.Since(hj.lastCheck).Milliseconds())) + hj.Service.OnFailure(err) } diff --git a/internal/manager/healthcheck/metrics.go b/internal/manager/healthcheck/metrics.go index 421b0b3..6de8773 100644 --- a/internal/manager/healthcheck/metrics.go +++ b/internal/manager/healthcheck/metrics.go @@ -1,124 +1,25 @@ -// collects different metrics for health-checks since start of service package healthcheck import ( - "sync" - "sync/atomic" 
- "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" ) -// holds the timestamp of healthcheck time -// and the result of that healthcheck -type Recording struct { - timestamp time.Time - result *Result -} - -func NewRecording(res *Result) *Recording { - return &Recording{ - timestamp: time.Now().Add(-res.timeTaken), - result: res, - } -} - -type HealthCheckMetricsCounter struct { - totalChecks atomic.Int64 - totalSuccess atomic.Int64 - totalFailure atomic.Int64 - maxRecordings int - recordings []*Recording // health-check timestamps - mu sync.RWMutex -} - -func NewMetricsCounter(max int) *HealthCheckMetricsCounter { - return &HealthCheckMetricsCounter{ - totalChecks: atomic.Int64{}, - totalSuccess: atomic.Int64{}, - totalFailure: atomic.Int64{}, - maxRecordings: max, - recordings: make([]*Recording, 0, max), - mu: sync.RWMutex{}, - } -} - -// total checks in the last given time-frame -func (c *HealthCheckMetricsCounter) Last(dur time.Duration) int { - c.mu.Lock() - defer c.mu.Unlock() - count := 0 - - interval := time.Now().Add(-dur) - - for i := len(c.recordings) - 1; i >= 0; i-- { - if !c.recordings[i].timestamp.Before(interval) { - count++ - } else { - // since we start from the back - // the first recording that is BEFORE the interval - // means that every interval next after this one is also before the interval - // therefore no need to check them when we know they are not going to hit - break - } - } - - return count -} - -// total checks that have been successfull in the last given time-frame -func (c *HealthCheckMetricsCounter) SuccessLast(dur time.Duration) int { - c.mu.Lock() - defer c.mu.Unlock() - count := 0 - - interval := time.Now().Add(-dur) - - for i := len(c.recordings) - 1; i >= 0; i-- { - if !c.recordings[i].timestamp.Before(interval) && c.recordings[i].result.Success { - count++ - } else { - // same reason as Last(...) 
func - break - } - } - - return count -} - -// total checks that have been failure in the last given time-frame -func (c *HealthCheckMetricsCounter) FailuresLast(dur time.Duration) int { - c.mu.Lock() - defer c.mu.Unlock() - count := 0 - - interval := time.Now().Add(-dur) - - for i := len(c.recordings) - 1; i >= 0; i-- { - if !c.recordings[i].timestamp.Before(interval) && !c.recordings[i].result.Success { - count++ - } else { - // same reason as Last(...) func - break - } - } - - return count -} - -func (c *HealthCheckMetricsCounter) Record(result *Result) { - c.totalChecks.Add(1) - - if result.Success { - c.totalSuccess.Add(1) - } else { - c.totalFailure.Add(1) - } - - c.mu.Lock() - defer c.mu.Unlock() +var ( + healthChecksTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "healthcheck_total", + Help: "Total health checks performed", + }, + []string{"service", "success"}, + ) + + healthCheckDuration = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "healthcheck_duration_ms", + Help: "Health check duration", + }, + []string{"service"}, + ) +) - if len(c.recordings) == c.maxRecordings { - c.recordings = append(c.recordings[1:], NewRecording(result)) - } else { - c.recordings = append(c.recordings, NewRecording(result)) - } -} diff --git a/internal/manager/manager.go b/internal/manager/manager.go index abd5e7f..d1837d0 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -7,6 +7,7 @@ import ( "sync" "time" + "github.com/vitistack/gslb-operator/internal/manager/healthcheck" "github.com/vitistack/gslb-operator/internal/manager/scheduler" "github.com/vitistack/gslb-operator/internal/model" svcRepo "github.com/vitistack/gslb-operator/internal/repositories/service" @@ -27,7 +28,7 @@ type ServicesManager struct { svcRepo *svcRepo.ServiceRepo mutex sync.RWMutex stop sync.Once - pool pool.WorkerPool + pool *pool.WorkerPool wg *sync.WaitGroup // schedulers use this when scheduling services asynchronously DNSUpdate 
func(*service.Service, bool) dryrun bool @@ -49,13 +50,21 @@ func NewManager(opts ...serviceManagerOption) *ServicesManager { bslog.Warn("dry-run enabled") } + pool := pool.NewWorkerPool(cfg.MinRunningWorkers, cfg.NonBlockingBufferSize) + pool.OnScaleUp = func() { + workerPoolSize.Inc() + } + pool.OnScaleDown = func() { + workerPoolSize.Dec() + } + return &ServicesManager{ scheduledServices: make(ScheduledServices), schedulers: make(map[timesutil.Duration]*scheduler.Scheduler), serviceGroups: make(map[string]*ServiceGroup), svcRepo: cfg.repo, mutex: sync.RWMutex{}, - pool: *pool.NewWorkerPool(cfg.MinRunningWorkers, cfg.NonBlockingBufferSize), + pool: pool, stop: sync.Once{}, wg: &sync.WaitGroup{}, dryrun: cfg.DryRun, @@ -188,9 +197,10 @@ func (sm *ServicesManager) RemoveService(id string) error { sm.mutex.RLock() group := sm.serviceGroups[svc.MemberOf] sm.mutex.RUnlock() + empty := group.RemoveService(svc.GetID()) // registered in group if empty { - delete(sm.serviceGroups, svc.MemberOf) + sm.deleteGroup(svc.MemberOf) } sm.mutex.Lock() @@ -239,9 +249,7 @@ func (sm *ServicesManager) updateService(old, new *service.Service) { if ok { oldGroup.Update() // notify potential changes to group } else { // this will probably never run, but you never know in concurrency! 
- sm.mutex.Lock() - delete(sm.serviceGroups, oldMemberOf) - sm.mutex.Unlock() + sm.deleteGroup(oldMemberOf) } } @@ -310,7 +318,7 @@ func (sm *ServicesManager) memberOfChanged(oldMemberOf, newMemberOf string, svc } if empty { // delete empty service group - delete(sm.serviceGroups, oldMemberOf) + sm.deleteGroup(oldMemberOf) } bslog.Debug( "updated service group membership", @@ -425,14 +433,24 @@ func (sm *ServicesManager) handlePromotion(event *PromotionEvent) { } func (sm *ServicesManager) newServiceGroup(memberOf string) *ServiceGroup { - newGroup := NewEmptyServiceGroup() + newGroup := NewEmptyServiceGroup(memberOf) newGroup.OnPromotion = func(event *PromotionEvent) { sm.handlePromotion(event) } sm.serviceGroups[memberOf] = newGroup + + serviceGroups.Inc() return newGroup } +// only called when we know it is safe to delete a group +func (sm *ServicesManager) deleteGroup(memberOf string) { + sm.mutex.Lock() + delete(sm.serviceGroups, memberOf) + sm.mutex.Unlock() + serviceGroups.Dec() +} + // creates a new scheduler, and starts its loop func (sm *ServicesManager) newScheduler(interval timesutil.Duration) *scheduler.Scheduler { if scheduler, ok := sm.schedulers[interval]; ok { // scheduler already exists @@ -442,8 +460,8 @@ func (sm *ServicesManager) newScheduler(interval timesutil.Duration) *scheduler. 
scheduler := scheduler.NewScheduler(time.Duration(interval), sm.wg) sm.schedulers[interval] = scheduler - scheduler.OnTick = func(s *service.Service) { - err := sm.pool.Put(s) + scheduler.OnTick = func(svc *service.Service) { + err := sm.pool.Put(healthcheck.NewJob(svc)) if errors.Is(err, pool.ErrPutOnClosedPool) { bslog.Error("failed to schedule health check", slog.String("reason", err.Error())) } @@ -477,6 +495,7 @@ func (sm *ServicesManager) moveServiceToInterval(svc *service.Service, newInterv if newScheduler == nil { newScheduler = sm.newScheduler(newInterval) } + newScheduler.ScheduleService(svc) bslog.Debug("sucessfully moved service to new interval", slog.String("oldInterval", oldInterval.String()), diff --git a/internal/manager/metrics.go b/internal/manager/metrics.go new file mode 100644 index 0000000..32d8706 --- /dev/null +++ b/internal/manager/metrics.go @@ -0,0 +1,26 @@ +package manager + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + workerPoolSize = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "worker_pool_size_total", + Help: "Number of running workers that perform health checks", + }) + + serviceGroups = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "service_groups_total", + Help: "Number of service groups", + }) + + serviceGroupMembers = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "service_group_members", + Help: "Number of members in each service group", + }, + []string{"group"}, + ) +) diff --git a/internal/manager/servicegroup.go b/internal/manager/servicegroup.go index 365650d..08ade74 100644 --- a/internal/manager/servicegroup.go +++ b/internal/manager/servicegroup.go @@ -42,6 +42,7 @@ type PromotionEvent struct { } type ServiceGroup struct { + Name string mode ServiceGroupMode // sorted by priority. 
@@ -62,8 +63,9 @@ type ServiceGroup struct { mu sync.RWMutex } -func NewEmptyServiceGroup() *ServiceGroup { +func NewEmptyServiceGroup(name string) *ServiceGroup { return &ServiceGroup{ + Name: name, mode: ActiveActive, Members: make([]*service.Service, 0), active: nil, @@ -125,7 +127,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h if healthy && sg.triggerPromotion(changedService) { event := &PromotionEvent{ - Service: changedService.Fqdn, + Service: sg.Name, OldActive: oldActive, NewActive: changedService, } @@ -140,7 +142,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h // If prioritized DC service becomes healthy, it must become active (single DNS record). if changedService.Datacenter == sg.prioritizedDatacenter && changedService != sg.active { sg.OnPromotion(&PromotionEvent{ - Service: changedService.Fqdn, + Service: sg.Name, NewActive: changedService, OldActive: sg.active, }) @@ -150,7 +152,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h // If there is no active or the current active is unhealthy, promote this healthy service. 
if sg.active == nil || !sg.active.IsHealthy() { sg.OnPromotion(&PromotionEvent{ - Service: changedService.Fqdn, + Service: sg.Name, NewActive: changedService, OldActive: sg.active, }) @@ -166,7 +168,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h next := sg.firstHealthy() if next != nil { sg.OnPromotion(&PromotionEvent{ - Service: changedService.Fqdn, + Service: sg.Name, NewActive: next, OldActive: sg.active, }) @@ -177,7 +179,7 @@ func (sg *ServiceGroup) OnServiceHealthChange(changedService *service.Service, h // all down -> signal DNS delete (single-record) sg.OnPromotion(&PromotionEvent{ - Service: changedService.Fqdn, + Service: sg.Name, NewActive: nil, OldActive: sg.active, }) @@ -203,6 +205,7 @@ func (sg *ServiceGroup) RegisterService(newService *service.Service) { sg.mu.Unlock() sg.Update() + serviceGroupMembers.WithLabelValues(newService.MemberOf).Inc() } func (sg *ServiceGroup) RemoveService(id string) bool { @@ -210,6 +213,15 @@ func (sg *ServiceGroup) RemoveService(id string) bool { members := sg.Members sg.mu.Unlock() + idx := slices.IndexFunc(members, func(s *service.Service) bool { + return s.GetID() == id + }) + if idx != -1 { + sg.mu.Lock() + sg.Members = append(members[:idx], members[idx+1:]...) 
+ sg.Update() + serviceGroupMembers.WithLabelValues().Dec() + } for idx, member := range members { if member.GetID() == id { sg.mu.Lock() @@ -243,7 +255,7 @@ func (sg *ServiceGroup) promoteNextHealthy() *PromotionEvent { if bestIdx != -1 { sg.active = sg.Members[bestIdx] return &PromotionEvent{ - Service: oldActive.Fqdn, + Service: sg.Name, NewActive: sg.active, OldActive: oldActive, } @@ -252,7 +264,7 @@ func (sg *ServiceGroup) promoteNextHealthy() *PromotionEvent { // No healthy services: signal DNS delete (NewActive=nil) sg.active = nil return &PromotionEvent{ - Service: oldActive.Fqdn, + Service: sg.Name, NewActive: nil, OldActive: oldActive, } @@ -304,21 +316,20 @@ func (sg *ServiceGroup) SetGroupMode() { } sg.mu.RUnlock() + sg.mu.Lock() + defer sg.mu.Unlock() + switch sg.mode { case ActiveActive: // If services have different priorities, switch to ActivePassive if !allSamePriority { - sg.mu.Lock() sg.mode = ActivePassive - sg.mu.Unlock() } case ActivePassive: // If all services have same priority, can switch to ActiveActive if allSamePriority { - sg.mu.Lock() sg.mode = ActiveActive - sg.mu.Unlock() // if none healthy, leave active nil } @@ -330,9 +341,7 @@ func (sg *ServiceGroup) SetGroupMode() { */ default: - sg.mu.Lock() sg.mode = ActiveActive - sg.mu.Unlock() } bslog.Debug("servicegroup mode set", slog.Any("mode", sg.mode.String())) } @@ -397,7 +406,7 @@ func (sg *ServiceGroup) Update() { sg.active = firstHealthy event := &PromotionEvent{ - Service: firstHealthy.MemberOf, + Service: sg.Name, OldActive: sg.lastActive, NewActive: sg.active, } diff --git a/internal/manager/servicegroup_test.go b/internal/manager/servicegroup_test.go index 60d3643..edee0aa 100644 --- a/internal/manager/servicegroup_test.go +++ b/internal/manager/servicegroup_test.go @@ -47,7 +47,7 @@ func TestMain(m *testing.M) { } func TestServiceGroup_RegisterService(t *testing.T) { - group := NewEmptyServiceGroup() + group := NewEmptyServiceGroup("test") group.OnPromotion = func(pe 
*PromotionEvent) { log.Println("got promotion") if pe != nil { @@ -75,7 +75,7 @@ func TestServiceGroup_RegisterService(t *testing.T) { } func TestServiceGroup_OnServiceHealthChange(t *testing.T) { - group := NewEmptyServiceGroup() + group := NewEmptyServiceGroup("test") group.RegisterService(active) group.OnPromotion = func(pe *PromotionEvent) { @@ -180,7 +180,7 @@ func TestServiceGroup_memberExists(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - sg := NewEmptyServiceGroup() + sg := NewEmptyServiceGroup("test") if tt.want { sg.RegisterService(tt.member) } diff --git a/internal/service/service.go b/internal/service/service.go index 6288316..5ebf0d8 100644 --- a/internal/service/service.go +++ b/internal/service/service.go @@ -297,7 +297,7 @@ func (s *Service) LogValue() slog.Value { // satisfies the stringer interface to allow passing s for %v in formatted strings func (s *Service) String() string { - return fmt.Sprintf("id:%s, memberOf: %s, fqdn: %s, datacenter: %s, ip: %s", s.id, s.MemberOf, s.Fqdn, s.Datacenter, s.GetIP()) + return fmt.Sprintf("%s:%s:%s:%s:%s", s.id, s.MemberOf, s.Fqdn, s.Datacenter, s.GetIP()) } func (s *Service) GSLBService() *model.GSLBService { diff --git a/pkg/pool/pool.go b/pkg/pool/pool.go index fea83e0..cbb6f2b 100644 --- a/pkg/pool/pool.go +++ b/pkg/pool/pool.go @@ -24,6 +24,10 @@ type WorkerPool struct { poolWg *sync.WaitGroup lock sync.Mutex closed *atomic.Bool + + // configurable action to take on worker-pool scale + OnScaleUp func() + OnScaleDown func() } func NewWorkerPool(minRunningWorkers, nonBlockingBufferSize uint) *WorkerPool { @@ -109,6 +113,10 @@ func (wp *WorkerPool) newWorker() { wp.poolWg.Add(1) go wp.worker(id) + + if wp.OnScaleUp != nil { + wp.OnScaleUp() + } } func (wp *WorkerPool) worker(id uint32) { @@ -140,6 +148,11 @@ func (wp *WorkerPool) worker(id uint32) { if wp.numRunningWorkers > wp.minRunningWorkers { wp.numRunningWorkers-- wp.lock.Unlock() + + if wp.OnScaleDown != nil { + 
wp.OnScaleDown() + } + return } wp.lock.Unlock() From d138420e0142c7fd493cef7169fdadeeb06e83f9 Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Thu, 26 Feb 2026 14:51:20 +0100 Subject: [PATCH 21/24] fix (servicegroup): correct locking on RemoveService --- internal/manager/servicegroup.go | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/internal/manager/servicegroup.go b/internal/manager/servicegroup.go index f5e8271..baeed47 100644 --- a/internal/manager/servicegroup.go +++ b/internal/manager/servicegroup.go @@ -7,7 +7,6 @@ import ( "sync" "github.com/vitistack/gslb-operator/internal/service" - "github.com/vitistack/gslb-operator/internal/utils" "github.com/vitistack/gslb-operator/pkg/bslog" "github.com/vitistack/gslb-operator/pkg/models/failover" ) @@ -222,18 +221,11 @@ func (sg *ServiceGroup) RemoveService(id string) bool { if idx != -1 { sg.mu.Lock() sg.Members = append(members[:idx], members[idx+1:]...) + sg.mu.Unlock() sg.Update() - serviceGroupMembers.WithLabelValues().Dec() - } - for idx, member := range members { - if member.GetID() == id { - sg.mu.Lock() - sg.Members = utils.RemoveIndexFromSlice(sg.Members, idx) - sg.mu.Unlock() - sg.Update() - break - } + serviceGroupMembers.WithLabelValues(sg.Name).Dec() } + return len(sg.Members) == 0 } From fe3d7aa8e641248c261d7e826c30c9ceb967046e Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Fri, 27 Feb 2026 09:22:05 +0100 Subject: [PATCH 22/24] fix(worker-pool): fixed locking when checking blocked queue --- cmd/main.go | 4 ++-- internal/checks/dryrun.go | 3 +++ internal/manager/manager.go | 4 +++- pkg/pool/pool.go | 6 +++++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 653eca0..e6bf50d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -56,7 +56,7 @@ func main() { zoneFetcher := dns.NewZoneFetcherWithAutoPoll() mgr := manager.NewManager( manager.WithMinRunningWorkers(100), - manager.WithNonBlockingBufferSize(110), + 
manager.WithNonBlockingBufferSize(50), manager.WithServiceRepository(svcRepo), manager.WithDryRun(true), ) @@ -165,7 +165,7 @@ func main() { } func getRandomGSLBConfig() []model.GSLBConfig { - configs := make([]model.GSLBConfig, 0, 500) + configs := make([]model.GSLBConfig, 0, 1000) cfg := model.GSLBConfig{ Fqdn: "test.example.com", diff --git a/internal/checks/dryrun.go b/internal/checks/dryrun.go index f02a769..45f3daa 100644 --- a/internal/checks/dryrun.go +++ b/internal/checks/dryrun.go @@ -9,6 +9,9 @@ import ( type DryRun struct{} func (dr *DryRun) Check() error { + + sleepDuration := time.Duration(100+rand.Intn(400)) * time.Millisecond + time.Sleep(sleepDuration) num := rand.Intn(10) if num == 0 { // 10% failure when dryrunning return errors.New("dry-run fail") diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 5663904..b348b9f 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -37,7 +37,7 @@ type ServicesManager struct { func NewManager(opts ...serviceManagerOption) *ServicesManager { cfg := managerConfig{ MinRunningWorkers: 100, - NonBlockingBufferSize: 110, + NonBlockingBufferSize: 100, DryRun: false, repo: svcRepo.NewServiceRepo(memory.NewStore[model.GSLBServiceGroup]()), } @@ -52,9 +52,11 @@ func NewManager(opts ...serviceManagerOption) *ServicesManager { pool := pool.NewWorkerPool(cfg.MinRunningWorkers, cfg.NonBlockingBufferSize) pool.OnScaleUp = func() { + bslog.Debug("worker-pool on scale up", slog.Int("numWorkers", int(pool.NumWorkers()))) workerPoolSize.Inc() } pool.OnScaleDown = func() { + bslog.Debug("worker-pool on scale down", slog.Int("numWorkers", int(pool.NumWorkers()))) workerPoolSize.Dec() } diff --git a/pkg/pool/pool.go b/pkg/pool/pool.go index cbb6f2b..d7dd629 100644 --- a/pkg/pool/pool.go +++ b/pkg/pool/pool.go @@ -81,8 +81,12 @@ func (wp *WorkerPool) Put(job Job) error { } func (wp *WorkerPool) scale() { + wp.lock.Lock() if wp.jobs.Blocked() { + wp.lock.Unlock() wp.newWorker() + } 
else { + wp.lock.Unlock() } } @@ -106,13 +110,13 @@ func (wp *WorkerPool) ScaleTo(targetWorkers uint) { func (wp *WorkerPool) newWorker() { wp.lock.Lock() - defer wp.lock.Unlock() wp.numRunningWorkers++ id := uuid.New().ID() wp.poolWg.Add(1) go wp.worker(id) + wp.lock.Unlock() if wp.OnScaleUp != nil { wp.OnScaleUp() From 76a2ea0bfea2478dfb8f4f054d1a2c101a086afb Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Tue, 3 Mar 2026 13:57:15 +0100 Subject: [PATCH 23/24] feat: dnsdist updates directly from operator instead of HTTP - call to dnsdist-worker --- cmd/main.go | 76 +++---- internal/config/config.go | 5 + internal/dns/handler.go | 17 +- internal/dns/update/dnsdist.go | 234 ++++++++++++++++++++ internal/dns/{updater.go => update/rest.go} | 20 +- internal/dns/update/updater.go | 8 + internal/model/dnsdist.go | 10 + internal/repositories/spoof/spoof.go | 27 +++ pkg/dnsdist/client.go | 55 ----- pkg/dnsdist/mock_server.go | 205 +++++++++++++++++ pkg/dnsdist/options.go | 74 +++++++ pkg/dnsdist/rule.go | 9 + 12 files changed, 631 insertions(+), 109 deletions(-) create mode 100644 internal/dns/update/dnsdist.go rename internal/dns/{updater.go => update/rest.go} (85%) create mode 100644 internal/dns/update/updater.go create mode 100644 internal/model/dnsdist.go create mode 100644 pkg/dnsdist/mock_server.go create mode 100644 pkg/dnsdist/options.go create mode 100644 pkg/dnsdist/rule.go diff --git a/cmd/main.go b/cmd/main.go index e6bf50d..8690c1f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -14,13 +14,12 @@ import ( "github.com/vitistack/gslb-operator/internal/api/handlers/failover" "github.com/vitistack/gslb-operator/internal/api/handlers/spoofs" "github.com/vitistack/gslb-operator/internal/api/routes" - "github.com/vitistack/gslb-operator/internal/checks" "github.com/vitistack/gslb-operator/internal/config" "github.com/vitistack/gslb-operator/internal/dns" + "github.com/vitistack/gslb-operator/internal/dns/update" "github.com/vitistack/gslb-operator/internal/manager" 
"github.com/vitistack/gslb-operator/internal/model" "github.com/vitistack/gslb-operator/internal/repositories/service" - "github.com/vitistack/gslb-operator/internal/utils/timesutil" "github.com/vitistack/gslb-operator/pkg/auth" "github.com/vitistack/gslb-operator/pkg/auth/jwt" "github.com/vitistack/gslb-operator/pkg/bslog" @@ -58,13 +57,14 @@ func main() { manager.WithMinRunningWorkers(100), manager.WithNonBlockingBufferSize(50), manager.WithServiceRepository(svcRepo), - manager.WithDryRun(true), + //manager.WithDryRun(true), ) - updater, err := dns.NewUpdater() + updater, err := update.NewDNSDISTUpdater(serviceFileStore) if err != nil { bslog.Fatal("unable to create updater", slog.String("error", err.Error())) } + dnsHandler := dns.NewHandler( zoneFetcher, mgr, @@ -72,15 +72,17 @@ func main() { ) background := context.Background() - dnsHandler.Start(context.WithCancel(background)) - - configs := getRandomGSLBConfig() - for _, cfg := range configs { - _, err := mgr.RegisterService(cfg) - if err != nil { - bslog.Fatal("could not create service", slog.String("reason", err.Error())) - } - } + ctx, cancel := context.WithCancel(background) + dnsHandler.Start(ctx, cancel) + updater.Synchronize(ctx) + + //configs := getRandomGSLBConfig() + //for _, cfg := range configs { + // _, err := mgr.RegisterService(cfg) + // if err != nil { + // bslog.Fatal("could not create service", slog.String("reason", err.Error())) + // } + //} api := http.NewServeMux() @@ -164,27 +166,27 @@ func main() { } } -func getRandomGSLBConfig() []model.GSLBConfig { - configs := make([]model.GSLBConfig, 0, 1000) - - cfg := model.GSLBConfig{ - Fqdn: "test.example.com", - Ip: "10.10.0.1", - Port: "80", - Datacenter: "DC1", - Interval: timesutil.FromDuration(time.Second * 5), - Priority: 1, - FailureThreshold: 3, - CheckType: checks.TCP_FULL, - } - - for idx := range cap(configs) { - - cfg.ServiceID = fmt.Sprintf("%d", idx) - cfg.MemberOf = fmt.Sprintf("%s.%s", cfg.ServiceID, cfg.Fqdn) - - configs = 
append(configs, cfg) - } - - return configs -} +//func getRandomGSLBConfig() []model.GSLBConfig { +// configs := make([]model.GSLBConfig, 0, 500) +// +// cfg := model.GSLBConfig{ +// Fqdn: "test.example.com", +// Ip: "10.10.0.1", +// Port: "80", +// Datacenter: "DC1", +// Interval: timesutil.FromDuration(time.Second * 5), +// Priority: 1, +// FailureThreshold: 3, +// CheckType: checks.TCP_FULL, +// } +// +// for idx := range cap(configs) { +// +// cfg.ServiceID = fmt.Sprintf("%d", idx) +// cfg.MemberOf = fmt.Sprintf("%s.%s", cfg.ServiceID, cfg.Fqdn) +// +// configs = append(configs, cfg) +// } +// +// return configs +//} diff --git a/internal/config/config.go b/internal/config/config.go index 4dce0fb..6018ba6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -99,6 +99,7 @@ type GSLB struct { NAMESERVER string `env:"GSLB_NAMESERVER" flag:"gslb-nameserver"` POLLINTERVAL string `env:"GSLB_POLL_INTERVAL" flag:"poll-interval"` UPDATERHOST string `env:"GSLB_UPDATER_HOST" flag:"updater-host"` + SERVERS string `env:"GSLB_DNSDIST_SERVERS_FILE"` } func (g *GSLB) Zone() string { @@ -122,6 +123,10 @@ func (g *GSLB) UpdaterHost() string { return g.UPDATERHOST } +func (g *GSLB) Servers() string { + return g.SERVERS +} + type JWT struct { SECRET string `env:"JWT_SECRET"` USER string `env:"JWT_USER"` diff --git a/internal/dns/handler.go b/internal/dns/handler.go index e23a3ba..52fde59 100644 --- a/internal/dns/handler.go +++ b/internal/dns/handler.go @@ -8,6 +8,7 @@ import ( "sync" "codeberg.org/miekg/dns" + "github.com/vitistack/gslb-operator/internal/dns/update" "github.com/vitistack/gslb-operator/internal/manager" "github.com/vitistack/gslb-operator/internal/model" "github.com/vitistack/gslb-operator/internal/service" @@ -18,14 +19,14 @@ import ( type Handler struct { fetcher *ZoneFetcher // fetch GSLB config from dns svcManager *manager.ServicesManager - updater *Updater + updater update.Updater knownServices map[string]struct{} // service.ID: makes it 
easier to look up using map, but dont need a real value! stop chan struct{} cancel func() // cancels context wg sync.WaitGroup } -func NewHandler(fetcher *ZoneFetcher, mgr *manager.ServicesManager, updater *Updater) *Handler { +func NewHandler(fetcher *ZoneFetcher, mgr *manager.ServicesManager, updater update.Updater) *Handler { return &Handler{ fetcher: fetcher, svcManager: mgr, @@ -63,10 +64,10 @@ func (h *Handler) Start(ctx context.Context, cancel func()) { func (h *Handler) Stop(ctx context.Context) { done := make(chan struct{}) go func() { - h.cancel() // cancel zone-updates - h.wg.Wait() - h.svcManager.Stop() - close(done) + h.cancel() // cancel zone-updates + h.wg.Wait() + h.svcManager.Stop() + close(done) }() select { @@ -78,14 +79,14 @@ func (h *Handler) Stop(ctx context.Context) { } func (h *Handler) onServiceDown(svc *service.Service) { - err := h.updater.ServiceDown(svc) + err := h.updater.OnServiceDown(svc) if err != nil { bslog.Warn("error while updating service on service down", slog.String("error", err.Error())) } } func (h *Handler) onServiceUp(svc *service.Service) { - err := h.updater.ServiceUp(svc) + err := h.updater.OnServiceUp(svc) if err != nil { bslog.Warn("error while updating service state on service up", slog.String("error", err.Error())) } diff --git a/internal/dns/update/dnsdist.go b/internal/dns/update/dnsdist.go new file mode 100644 index 0000000..b6d7761 --- /dev/null +++ b/internal/dns/update/dnsdist.go @@ -0,0 +1,234 @@ +package update + +import ( + "bufio" + "cmp" + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "log/slog" + "os" + "regexp" + "slices" + "strings" + "sync" + "time" + + "github.com/vitistack/gslb-operator/internal/config" + "github.com/vitistack/gslb-operator/internal/model" + repo "github.com/vitistack/gslb-operator/internal/repositories/spoof" + "github.com/vitistack/gslb-operator/internal/service" + "github.com/vitistack/gslb-operator/pkg/bslog" + 
"github.com/vitistack/gslb-operator/pkg/dnsdist" + "github.com/vitistack/gslb-operator/pkg/models/spoofs" + "github.com/vitistack/gslb-operator/pkg/persistence" +) + +const DEFAULT_SYNCHRONIZE_JOB = time.Minute + +// contacts dnsdist servers to make update directly +type DNSDISTUpdater struct { + servers map[string]*dnsdist.Client + spoofRepo repo.SpoofRepo +} + +func NewDNSDISTUpdater(store persistence.Store[model.GSLBServiceGroup]) (*DNSDISTUpdater, error) { + updater := &DNSDISTUpdater{ + servers: make(map[string]*dnsdist.Client), + spoofRepo: *repo.NewSpoofRepo(store), + } + + file, err := os.ReadFile(config.GetInstance().GSLB().Servers()) + if err != nil { + return nil, fmt.Errorf("could could not load dnsdist servers configuration: %w", err) + } + servers := []model.DNSDISTServer{} + err = json.Unmarshal(file, &servers) + if err != nil { + return nil, fmt.Errorf("malformed dnsdist servers configuration: %w", err) + } + + for _, server := range servers { + client, err := dnsdist.NewClient( + server.Key, + dnsdist.WithHost(server.Host.String()), + dnsdist.WithPort(server.Port), + dnsdist.WithTimeout(time.Second*5), + dnsdist.WithNumRetriesOnCommandFailure(3), + ) + + if err != nil { + return nil, fmt.Errorf("unable to create dnsdist client: %w", err) + } + + updater.servers[server.Name] = client + } + + err = updater.synchronizeServers() + if err != nil { + return updater, fmt.Errorf("failed synchronization on updater init: %w", err) + } + + return updater, nil +} + +func (d *DNSDISTUpdater) OnServiceUp(svc *service.Service) error { + + for _, client := range d.servers { + err := client.AddDomainSpoof(svc.MemberOf+":"+svc.Datacenter, svc.MemberOf, svc.GetIP()) + if err != nil { + return fmt.Errorf("could not create dnsdist-spoof: %w", err) + } + } + + return nil +} + +func (d *DNSDISTUpdater) OnServiceDown(svc *service.Service) error { + for _, client := range d.servers { + err := client.RmRuleWithName(svc.MemberOf + ":" + svc.Datacenter) + if err != nil { + 
return fmt.Errorf("could not remove dnsdist-spoof: %w", err) + } + } + return nil +} + +func (d *DNSDISTUpdater) Synchronize(ctx context.Context) { + go func() { + for { + select { + case <-ctx.Done(): + bslog.Info("stopping dnsdist - server synchronization") + + // close controll socket connections + for _, client := range d.servers { + client.Disconnect() + } + + return + case <-time.After(DEFAULT_SYNCHRONIZE_JOB): + err := d.synchronizeServers() + if err != nil { + bslog.Error("unable to synchronize dnsdist - servers", slog.String("reason", err.Error())) + } + } + } + }() +} + +func (d *DNSDISTUpdater) synchronizeServers() error { + desiredHash, err := d.spoofRepo.Hash() + if err != nil { + return fmt.Errorf("unable to get hash representation of spoofs: %w", err) + } + + wg := sync.WaitGroup{} + + for server, client := range d.servers { + wg.Go(func() { + rawRuleSet, err := client.ShowRules() + if err != nil { + bslog.Error("unable to fetch ruleset from dnsdist server", slog.String("reason", err.Error())) + return + } + + data, err := d.ParseRuleSet(rawRuleSet) + if err != nil { + bslog.Error("could not synchronize dnsdist server", slog.String("reason", err.Error())) + } + + slices.SortFunc(data, func(a, b spoofs.Spoof) int { + return cmp.Compare(fmt.Sprintf("%s:%s", a.FQDN, a.DC), fmt.Sprintf("%s:%s", b.FQDN, b.DC)) + }) + + marshalledSpoofs, err := json.Marshal(data) + if err != nil { + bslog.Error("unable to marshall spoofs", slog.String("reason", err.Error())) + return + } + + rawHash := sha256.Sum256(marshalledSpoofs) // creating bytes representation of spoofs + hash := hex.EncodeToString(rawHash[:]) + if hash != desiredHash { + err := d.reconcileServer(client, data) + if err != nil { + bslog.Warn("failed to reconcile server", slog.String("server_name", server)) + } + } + }) + } + + wg.Wait() + + return nil +} + +func (d *DNSDISTUpdater) ParseRuleSet(ruleSet string) ([]spoofs.Spoof, error) { + reader := strings.NewReader(ruleSet) + lines := 
bufio.NewScanner(reader) + + pattern, err := regexp.Compile(`[a-zA-Z0-9._-]+:[A-Z0-9]+|spoof|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`) + if err != nil { + return nil, fmt.Errorf("unable to compile regex: %w", err) + } + + spoofRules := make([]spoofs.Spoof, 0) + for lines.Scan() { + line := lines.Text() + matches := pattern.FindAllString(line, -1) + if len(matches) < 3 { + continue + } + rule := dnsdist.Rule{ + Name: matches[0], + Action: matches[1], + } + + if rule.Action != "spoof" { + continue + } + + spoofRules = append(spoofRules, + spoofs.Spoof{ + FQDN: strings.Split(rule.Name, ":")[0], + DC: strings.Split(rule.Name, ":")[1], + IP: matches[2], + }) + } + + return spoofRules, nil +} + +func (d *DNSDISTUpdater) reconcileServer(client *dnsdist.Client, configuredSpoofs []spoofs.Spoof) error { + gslbspoofs, err := d.spoofRepo.ReadAll() + if err != nil { + return fmt.Errorf("could not fetch spoofs: %w", err) + } + + for _, spoof := range configuredSpoofs { // remove all spoofs that should not exist any more + if !slices.ContainsFunc(gslbspoofs, func(s spoofs.Spoof) bool { + return s.FQDN+":"+s.DC == spoof.FQDN+":"+spoof.DC + }) { + err := client.RmRuleWithName(spoof.FQDN + ":" + spoof.DC) + if err != nil { + return fmt.Errorf("could not remove spoof: %w", err) + } + } + } + + for _, spoof := range gslbspoofs { // add all spoofs that does not exist but should + if !slices.ContainsFunc(configuredSpoofs, func(s spoofs.Spoof) bool { + return s.FQDN+":"+s.DC == spoof.FQDN+":"+spoof.DC + }) { + err := client.AddDomainSpoof(spoof.FQDN+":"+spoof.DC, spoof.FQDN, spoof.IP) + if err != nil { + return fmt.Errorf("could not remove spoof: %w", err) + } + } + } + + return nil +} diff --git a/internal/dns/updater.go b/internal/dns/update/rest.go similarity index 85% rename from internal/dns/updater.go rename to internal/dns/update/rest.go index 8b87f32..e49a810 100644 --- a/internal/dns/updater.go +++ b/internal/dns/update/rest.go @@ -1,4 +1,6 @@ -package dns +package update + +// sends 
HTTP request to update dns import ( "fmt" @@ -14,16 +16,16 @@ import ( "github.com/vitistack/gslb-operator/pkg/rest/request/client" ) -type updaterOption func(u *Updater) +type updaterOption func(u *RESTUpdater) -type Updater struct { +type RESTUpdater struct { Server string client client.HTTPClient builder *request.Builder mu *sync.Mutex } -func NewUpdater(opts ...updaterOption) (*Updater, error) { +func NewUpdater(opts ...updaterOption) (*RESTUpdater, error) { c, err := client.NewClient( time.Second*5, client.WithRetry(3), @@ -34,7 +36,7 @@ func NewUpdater(opts ...updaterOption) (*Updater, error) { return nil, fmt.Errorf("unable to create http client: %s", err.Error()) } - u := &Updater{ + u := &RESTUpdater{ Server: config.GetInstance().GSLB().UpdaterHost(), client: *c, mu: &sync.Mutex{}, @@ -49,18 +51,18 @@ func NewUpdater(opts ...updaterOption) (*Updater, error) { } func UpdaterWithServer(server string) updaterOption { - return func(u *Updater) { + return func(u *RESTUpdater) { u.Server = server } } func UpdaterWithClient(client *client.HTTPClient) updaterOption { - return func(u *Updater) { + return func(u *RESTUpdater) { u.client = *client } } -func (u *Updater) ServiceDown(svc *service.Service) error { +func (u *RESTUpdater) ServiceDown(svc *service.Service) error { token, err := jwt.GetInstance().GetServiceToken() if err != nil { return fmt.Errorf("could not fetch service token: %w", err) @@ -86,7 +88,7 @@ func (u *Updater) ServiceDown(svc *service.Service) error { return nil } -func (u *Updater) ServiceUp(svc *service.Service) error { +func (u *RESTUpdater) ServiceUp(svc *service.Service) error { token, err := jwt.GetInstance().GetServiceToken() if err != nil { return fmt.Errorf("could not fetch service token: %w", err) diff --git a/internal/dns/update/updater.go b/internal/dns/update/updater.go new file mode 100644 index 0000000..c76bb2c --- /dev/null +++ b/internal/dns/update/updater.go @@ -0,0 +1,8 @@ +package update + +import 
"github.com/vitistack/gslb-operator/internal/service" + +type Updater interface { + OnServiceUp(*service.Service) error + OnServiceDown(*service.Service) error +} diff --git a/internal/model/dnsdist.go b/internal/model/dnsdist.go new file mode 100644 index 0000000..5ed8d21 --- /dev/null +++ b/internal/model/dnsdist.go @@ -0,0 +1,10 @@ +package model + +import "net" + +type DNSDISTServer struct { + Name string `json:"name"` + Host net.IP `json:"host"` + Port string `json:"port"` + Key string `json:"key"` +} diff --git a/internal/repositories/spoof/spoof.go b/internal/repositories/spoof/spoof.go index c8cd1cc..6a99003 100644 --- a/internal/repositories/spoof/spoof.go +++ b/internal/repositories/spoof/spoof.go @@ -1,8 +1,13 @@ package spoof import ( + "cmp" + "crypto/sha256" + "encoding/hex" + "encoding/json" "errors" "fmt" + "slices" "github.com/vitistack/gslb-operator/internal/model" "github.com/vitistack/gslb-operator/pkg/models/spoofs" @@ -71,3 +76,25 @@ func (r *SpoofRepo) ReadAll() ([]spoofs.Spoof, error) { return spoofs, nil } + +func (r *SpoofRepo) Hash() (string, error) { + data, err := r.ReadAll() + if err != nil { + return "", err + } + + slices.SortFunc( + data, + func(a, b spoofs.Spoof) int { + return cmp.Compare(a.FQDN+":"+a.DC, b.FQDN+":"+b.DC) + }, + ) + + marshalledSpoofs, err := json.Marshal(data) + if err != nil { + return "", fmt.Errorf("unable to serialize spoofs: %w", err) + } + + rawHash := sha256.Sum256(marshalledSpoofs) // creating bytes representation of spoofs + return hex.EncodeToString(rawHash[:]), nil +} diff --git a/pkg/dnsdist/client.go b/pkg/dnsdist/client.go index 5d49c4a..19c4977 100644 --- a/pkg/dnsdist/client.go +++ b/pkg/dnsdist/client.go @@ -13,8 +13,6 @@ import ( "fmt" "io" "net" - "strconv" - "strings" "time" "golang.org/x/crypto/nacl/secretbox" @@ -25,8 +23,6 @@ const ( NONCE_LEN = 24 ) -type clientOption func(c *Client) error - type Client struct { conn net.Conn //raw connection to configured Host and Port key [KEY_LEN]byte 
@@ -72,57 +68,6 @@ func NewClient(key string, options ...clientOption) (*Client, error) { return client, nil } -func WithHost(host string) clientOption { - return func(c *Client) error { - ip := net.ParseIP(host) - if ip == nil { - return ErrCouldNotParseAddr - } - c.host = ip - return nil - } -} - -func WithPort(port string) clientOption { - return func(c *Client) error { - port = strings.TrimSpace(port) - if port == "" { - return ErrCouldNotParseAddr - } - // Ensure all characters are digits - for _, r := range port { - if r < '0' || r > '9' { - return ErrCouldNotParseAddr - } - } - - p, err := strconv.Atoi(port) - if err != nil || p < 1 || p > 65535 { - return ErrCouldNotParseAddr - } - - c.port = port - return nil - } -} - -func WithTimeout(timeout time.Duration) clientOption { - return func(c *Client) error { - c.timeout = timeout - return nil - } -} - -func WithNumRetriesOnCommandFailure(retries int) clientOption { - return func(c *Client) error { - if retries < 0 { - return ErrNegativeRetryCount - } - c.retries = retries - return nil - } -} - func (c *Client) generateClientNonce() error { bufferNonce := make([]byte, NONCE_LEN) _, err := rand.Read(bufferNonce) // initialize client nonce diff --git a/pkg/dnsdist/mock_server.go b/pkg/dnsdist/mock_server.go new file mode 100644 index 0000000..14791d5 --- /dev/null +++ b/pkg/dnsdist/mock_server.go @@ -0,0 +1,205 @@ +package dnsdist + +import ( + "crypto/rand" + "encoding/binary" + "fmt" + "io" + "net" + "sync" + "testing" + + "golang.org/x/crypto/nacl/secretbox" +) + +// MockServer simulates a dnsdist console server for testing +type MockServer struct { + listener net.Listener + key [KEY_LEN]byte + addr string + handlers map[string]func(string) string // command -> response handler + mu sync.RWMutex + running bool + wg sync.WaitGroup +} + +// NewMockServer creates a new mock dnsdist server with the given key +func NewMockServer(t *testing.T, key [KEY_LEN]byte) *MockServer { + listener, err := net.Listen("tcp", 
"127.0.0.1:0") + if err != nil { + t.Fatalf("failed to create mock server: %v", err) + } + + ms := &MockServer{ + listener: listener, + key: key, + addr: listener.Addr().String(), + handlers: make(map[string]func(string) string), + } + + // Set default handlers + ms.SetHandler("", func(cmd string) string { return "" }) // empty command for handshake + ms.SetHandler("showRules()", func(cmd string) string { return "Rules:\n" }) + + return ms +} + +// Start begins accepting connections +func (ms *MockServer) Start() { + ms.mu.Lock() + ms.running = true + ms.mu.Unlock() + + ms.wg.Add(1) + go ms.acceptLoop() +} + +// Stop stops the server and closes all connections +func (ms *MockServer) Stop() { + ms.mu.Lock() + ms.running = false + ms.mu.Unlock() + + ms.listener.Close() + ms.wg.Wait() +} + +// Addr returns the server's address +func (ms *MockServer) Addr() string { + return ms.addr +} + +// SetHandler sets a response handler for a specific command +func (ms *MockServer) SetHandler(cmd string, handler func(string) string) { + ms.mu.Lock() + defer ms.mu.Unlock() + ms.handlers[cmd] = handler +} + +func (ms *MockServer) acceptLoop() { + defer ms.wg.Done() + + for { + conn, err := ms.listener.Accept() + if err != nil { + ms.mu.RLock() + running := ms.running + ms.mu.RUnlock() + if !running { + return + } + continue + } + + ms.wg.Add(1) + go ms.handleConnection(conn) + } +} + +func (ms *MockServer) handleConnection(conn net.Conn) { + defer ms.wg.Done() + defer conn.Close() + + // Read client nonce + cNonce := make([]byte, NONCE_LEN) + _, err := io.ReadFull(conn, cNonce) + if err != nil { + return + } + + // Generate and send server nonce + sNonce := make([]byte, NONCE_LEN) + _, err = rand.Read(sNonce) + if err != nil { + return + } + + _, err = conn.Write(sNonce) + if err != nil { + return + } + + // Initialize read/write nonces + var rNonce, wNonce [NONCE_LEN]byte + halfNonce := NONCE_LEN / 2 + + // Server's read nonce (client's write nonce) + copy(rNonce[:halfNonce], 
sNonce[:halfNonce]) + copy(rNonce[halfNonce:], cNonce[halfNonce:]) + + // Server's write nonce (client's read nonce) + copy(wNonce[:halfNonce], cNonce[:halfNonce]) + copy(wNonce[halfNonce:], sNonce[halfNonce:]) + + // Handle commands + for { + cmd, err := ms.receiveCommand(conn, &rNonce) + if err != nil { + return + } + + response := ms.getResponse(cmd) + + err = ms.sendResponse(conn, response, &wNonce) + if err != nil { + return + } + } +} + +func (ms *MockServer) receiveCommand(conn net.Conn, rNonce *[NONCE_LEN]byte) (string, error) { + // Read length + bufferLen := make([]byte, 4) + _, err := io.ReadFull(conn, bufferLen) + if err != nil { + return "", err + } + + // Read encrypted command + cmdLen := binary.BigEndian.Uint32(bufferLen) + encryptedCmd := make([]byte, cmdLen) + _, err = io.ReadFull(conn, encryptedCmd) + if err != nil { + return "", err + } + + // Decrypt + decrypted, ok := secretbox.Open(nil, encryptedCmd, rNonce, &ms.key) + if !ok { + return "", fmt.Errorf("decryption failed") + } + + incrementNonce(rNonce) + + return string(decrypted), nil +} + +func (ms *MockServer) sendResponse(conn net.Conn, response string, wNonce *[NONCE_LEN]byte) error { + // Encrypt response + encrypted := secretbox.Seal(nil, []byte(response), wNonce, &ms.key) + incrementNonce(wNonce) + + // Send length + bufferLen := make([]byte, 4) + binary.BigEndian.PutUint32(bufferLen, uint32(len(encrypted))) + _, err := conn.Write(bufferLen) + if err != nil { + return err + } + + // Send encrypted response + _, err = conn.Write(encrypted) + return err +} + +func (ms *MockServer) getResponse(cmd string) string { + ms.mu.RLock() + defer ms.mu.RUnlock() + + if handler, ok := ms.handlers[cmd]; ok { + return handler(cmd) + } + + // Default: return empty response + return "" +} \ No newline at end of file diff --git a/pkg/dnsdist/options.go b/pkg/dnsdist/options.go new file mode 100644 index 0000000..6b36286 --- /dev/null +++ b/pkg/dnsdist/options.go @@ -0,0 +1,74 @@ +package dnsdist + 
+import ( + "fmt" + "net" + "strconv" + "strings" + "time" +) + +type clientOption func(c *Client) error + +func WithHost(host string) clientOption { + return func(c *Client) error { + ip := net.ParseIP(host) + if ip == nil { + return ErrCouldNotParseAddr + } + c.host = ip + return nil + } +} + +func WithHostName(hostname string) clientOption { + return func(c *Client) error { + ips, err := net.LookupHost(hostname) + if err != nil { + return fmt.Errorf("DNS - lookup failed: %w", err) + } + + c.host = net.IP(ips[0]) + return nil + } +} + +func WithPort(port string) clientOption { + return func(c *Client) error { + port = strings.TrimSpace(port) + if port == "" { + return ErrCouldNotParseAddr + } + // Ensure all characters are digits + for _, r := range port { + if r < '0' || r > '9' { + return ErrCouldNotParseAddr + } + } + + p, err := strconv.Atoi(port) + if err != nil || p < 1 || p > 65535 { + return ErrCouldNotParseAddr + } + + c.port = port + return nil + } +} + +func WithTimeout(timeout time.Duration) clientOption { + return func(c *Client) error { + c.timeout = timeout + return nil + } +} + +func WithNumRetriesOnCommandFailure(retries int) clientOption { + return func(c *Client) error { + if retries < 0 { + return ErrNegativeRetryCount + } + c.retries = retries + return nil + } +} diff --git a/pkg/dnsdist/rule.go b/pkg/dnsdist/rule.go new file mode 100644 index 0000000..34b37aa --- /dev/null +++ b/pkg/dnsdist/rule.go @@ -0,0 +1,9 @@ +package dnsdist + +type Rule struct { + ID string + Name string + Matches string + Rule string + Action string +} From d05b9398cd2c61355b2ceb1f0d6871ddea9a071e Mon Sep 17 00:00:00 2001 From: Espen Wobbes Date: Wed, 4 Mar 2026 15:12:24 +0100 Subject: [PATCH 24/24] fix (metrics): better label utilization on healthchecks --- cmd/main.go | 2 +- examples/prometheus.ql | 17 +++++++++++++++++ internal/manager/healthcheck/healtheck.go | 23 ++++++++++++++++++----- internal/manager/healthcheck/metrics.go | 5 +++-- 
internal/manager/metrics.go | 2 +- 5 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 examples/prometheus.ql diff --git a/cmd/main.go b/cmd/main.go index 8690c1f..bbd670b 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -54,7 +54,7 @@ func main() { // creating dns - handler objects zoneFetcher := dns.NewZoneFetcherWithAutoPoll() mgr := manager.NewManager( - manager.WithMinRunningWorkers(100), + manager.WithMinRunningWorkers(80), manager.WithNonBlockingBufferSize(50), manager.WithServiceRepository(svcRepo), //manager.WithDryRun(true), diff --git a/examples/prometheus.ql b/examples/prometheus.ql new file mode 100644 index 0000000..acf421e --- /dev/null +++ b/examples/prometheus.ql @@ -0,0 +1,17 @@ +# worker-pool size +worker_pool_size_total + +# number of service groups +service_groups_total + +# number of registered services +sum(service_group_members) + +# number of health checks in the last +sum(increase(healthcheck_total[$__rate_interval])) + +# average health check duration towards each datacenter +sum by(datacenter) (rate(healthcheck_duration_ms_sum[5m])) / sum by(datacenter) (rate(healthcheck_duration_ms_count[5m])) + +# health check success rate percentage towards each datacenter +(sum by(datacenter) (rate(healthcheck_total{status="success"}[$__rate_interval]))) * 100 / (sum by(datacenter) (rate(healthcheck_total[$__rate_interval]))) diff --git a/internal/manager/healthcheck/healtheck.go b/internal/manager/healthcheck/healtheck.go index e592d75..5891ba6 100644 --- a/internal/manager/healthcheck/healtheck.go +++ b/internal/manager/healthcheck/healtheck.go @@ -23,18 +23,31 @@ func (hj *HealthCheckJob) Execute() error { hj.lastCheck = time.Now() err := hj.Service.Execute() - bslog.Debug("check complete", slog.Float64("duration_ms", float64(time.Since(hj.lastCheck).Milliseconds()))) + checkTimeMs := float64(time.Since(hj.lastCheck).Milliseconds()) + + bslog.Debug("check complete", slog.Float64("duration_ms", checkTimeMs)) + 
healthCheckDuration.WithLabelValues( + hj.Service.MemberOf, + hj.Service.Fqdn, + hj.Service.Datacenter). + Observe(checkTimeMs) return err } func (hj *HealthCheckJob) OnSuccess() { - healthChecksTotal.WithLabelValues(hj.Service.String(), "success").Inc() - healthCheckDuration.WithLabelValues(hj.Service.String()).Observe(float64(time.Since(hj.lastCheck).Milliseconds())) + healthChecksTotal.WithLabelValues(hj.Service.MemberOf, + hj.Service.Fqdn, + hj.Service.Datacenter, + "success"). + Inc() hj.Service.OnSuccess() } func (hj *HealthCheckJob) OnFailure(err error) { - healthChecksTotal.WithLabelValues(hj.Service.String(), "failure").Inc() - healthCheckDuration.WithLabelValues(hj.Service.String()).Observe(float64(time.Since(hj.lastCheck).Milliseconds())) + healthChecksTotal.WithLabelValues(hj.Service.MemberOf, + hj.Service.Fqdn, + hj.Service.Datacenter, + "failure"). + Inc() hj.Service.OnFailure(err) } diff --git a/internal/manager/healthcheck/metrics.go b/internal/manager/healthcheck/metrics.go index 6de8773..f4aa0f8 100644 --- a/internal/manager/healthcheck/metrics.go +++ b/internal/manager/healthcheck/metrics.go @@ -11,15 +11,16 @@ var ( Name: "healthcheck_total", Help: "Total health checks performed", }, - []string{"service", "success"}, + []string{"memberOf", "endpoint", "datacenter", "status"}, ) healthCheckDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "healthcheck_duration_ms", Help: "Health check duration", + Buckets: []float64{1, 5, 25, 50, 100, 250, 500, 1000, 2500, 5000}, }, - []string{"service"}, + []string{"memberOf", "endpoint", "datacenter"}, ) ) diff --git a/internal/manager/metrics.go b/internal/manager/metrics.go index 32d8706..233e1ce 100644 --- a/internal/manager/metrics.go +++ b/internal/manager/metrics.go @@ -21,6 +21,6 @@ var ( Name: "service_group_members", Help: "Number of members in each service group", }, - []string{"group"}, + []string{"memberOf"}, ) )