diff --git a/core/resource/resource.go b/core/resource/resource.go index 468644fa8..5e82116c3 100644 --- a/core/resource/resource.go +++ b/core/resource/resource.go @@ -8,7 +8,6 @@ import ( "os" "os/exec" "path/filepath" - "slices" "strings" "time" @@ -1511,17 +1510,6 @@ func getFiles(ctx context.Context, t Driver) Files { return files } -func (t Files) Merge(f File) Files { - l := slices.Clone(t) - for i, this := range l { - if this.Name == f.Name { - l[i] = f - return l - } - } - return append(l, f) -} - func (t Files) Lookup(name string) (File, bool) { for _, file := range t { if file.Name == name { diff --git a/daemon/imon/fetch_resource_files.go b/daemon/imon/fetch_resource_files.go index 665e9a40d..5f9a8fade 100644 --- a/daemon/imon/fetch_resource_files.go +++ b/daemon/imon/fetch_resource_files.go @@ -22,8 +22,46 @@ import ( "github.com/opensvc/om3/v3/daemon/msgbus" ) -func (t *filesManager) Fetched() resource.Files { - l := make(resource.Files, len(t.fetched)) +type ( + ridFile struct { + *resource.File + rid string + } + + ridFiles []ridFile + + filesManager struct { + // fetched stores the resource files we fetched to avoid uneeded refetch + fetched map[string]ridFile + + // attention stores a pending InstanceStatusUpdated event received while the fetch + // manager was already processing an event. This serves as a flag to immediately + // retrigger a new fetch cycle upon completion of the current one. + attention *msgbus.InstanceStatusUpdated + + // fetching is true when the resource files fetch and ingest routine is + // running + fetching bool + } +) + +func newFilesManager() *filesManager { + return &filesManager{ + fetched: make(map[string]ridFile), + } +} + +func (t ridFiles) Lookup(name string) (ridFile, bool) { + for _, file := range t { + if file.Name == name { + return file, true + } + } + return ridFile{}, false +} + +func (t *filesManager) Fetched() ridFiles { + l := make(ridFiles, len(t.fetched)) i := 0 for _, v := range t.fetched { l[i] = v @@ -33,18 +71,25 @@ func (t *filesManager) Fetched() resource.Files { } func (t *Manager) handleResourceFiles(ev *msgbus.InstanceStatusUpdated) { + if t.instConfig.ActorConfig == nil { + return + } + if ev.Node == t.localhost { + t.handleLocalResourceFiles(ev) + return + } + if !t.needFetchResourceFiles(ev) { + return + } if t.files.fetching { t.files.attention = ev return } - t.doHandleResourceFiles(ev) + t.handlePeerResourceFiles(ev) } -func (t *Manager) doHandleResourceFiles(ev *msgbus.InstanceStatusUpdated) { +func (t *Manager) handlePeerResourceFiles(ev *msgbus.InstanceStatusUpdated) { t.files.attention = nil - if !t.needFetchResourceFiles(ev) { - return - } t.files.fetching = true go t.fetchResourceFiles(t.files.Fetched(), t.instStatus[t.localhost], ev) } @@ -54,14 +99,8 @@ func (t *Manager) needFetchResourceFiles(ev *msgbus.InstanceStatusUpdated) bool if t.state.LocalExpect != instance.MonitorLocalExpectNone { return false } - if t.instConfig.ActorConfig == nil { - return false - } // Is the remote node a valid resource file authority - if ev.Node == t.localhost { - return false - } if ev.Value.Avail != status.Up { return false } @@ -69,7 +108,83 @@ func (t *Manager) needFetchResourceFiles(ev *msgbus.InstanceStatusUpdated) bool return true } -func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus instance.Status, ev *msgbus.InstanceStatusUpdated) { +func (t *Manager) initLocalResourceFiles() { + instanceStatus, ok := t.instStatus[t.localhost] + if !ok { + return + } + for rid, localResourceStatus := range instanceStatus.Resources { + for _, f := range localResourceStatus.Files { + t.log.Infof("%s: file %s discovered (csum=%s, mtime=%s)", rid, f.Name, f.Checksum, f.Mtime) + t.files.fetched[f.Name] = ridFile{ + rid: rid, + File: &f, + } + } + } +} + +func (t *Manager) handleLocalResourceFiles(ev *msgbus.InstanceStatusUpdated) { + var needFetch bool + + // Prepare a map of existing filenames (e.g. reported in the local + // instance status). + existingFilenames := make(map[string]any) + for rid, localResourceStatus := range ev.Value.Resources { + for _, f := range localResourceStatus.Files { + existingFilenames[f.Name] = nil + + if ev.Value.Avail != status.Up { + fetchedFile, ok := t.files.fetched[f.Name] + if !ok { + t.log.Infof("%s: file %s discovered (csum=%s, mtime=%s)", rid, f.Name, f.Checksum, f.Mtime) + t.files.fetched[f.Name] = ridFile{ + rid: rid, + File: &f, + } + needFetch = true + } else if fetchedFile.Checksum != f.Checksum { + t.log.Infof("%s: file %s altered locally (csum=%s, mtime=%s)", rid, f.Name, f.Checksum, f.Mtime) + needFetch = true + } + } + } + } + + // Prepare the list of filenames we fetched but are no longer existing + var toDelete []string + for filename, fetchedFile := range t.files.fetched { + if _, ok := existingFilenames[filename]; !ok { + toDelete = append(toDelete, filename) + t.log.Infof("%s: file %s disappeared", fetchedFile.rid, filename) + needFetch = true + } + } + + // Mark these filenames as not fetched + for _, filename := range toDelete { + delete(t.files.fetched, filename) + } + + // If a file was changed or removed or discovered, fetch from the up instance + if needFetch && ev.Value.Avail != status.Up { + for nodename, instanceStatus := range instance.StatusData.GetByPath(t.path) { + if t.localhost == nodename { + continue + } + if instanceStatus.Avail != status.Up { + continue + } + t.handleResourceFiles(&msgbus.InstanceStatusUpdated{ + Node: nodename, + Value: *instanceStatus, + }) + break + } + } +} + +func (t *Manager) fetchResourceFiles(fetched ridFiles, localInstanceStatus instance.Status, ev *msgbus.InstanceStatusUpdated) { var ( ridsToIngest []string @@ -97,7 +212,11 @@ func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus if !ok { // fallback to the t.instStatus cache, ie we never fetched // this file yet. - localFile, _ = localResourceStatus.Files.Lookup(peerFile.Name) + file, _ := localResourceStatus.Files.Lookup(peerFile.Name) + localFile = ridFile{ + File: &file, + rid: rid, + } } if !peerFile.Mtime.After(localFile.Mtime) { continue @@ -106,11 +225,14 @@ func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus continue } if err := t.fetchResourceFile(rid, peerFile, ev.Node); err != nil { - t.log.Warnf("%s", err) + t.log.Warnf("%s: fetch %s: %s", rid, peerFile.Name, err) continue } - done.Files = append(done.Files, peerFile) + done.Files = append(done.Files, ridFile{ + File: &peerFile, + rid: rid, + }) if peerFile.Ingest { ridsToIngest = append(ridsToIngest, rid) @@ -130,7 +252,7 @@ func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus } func (t *Manager) fetchResourceFile(rid string, peerFile resource.File, from string) error { - t.log.Infof("%s: fetch %s from %s", rid, peerFile.Name, from) + t.log.Infof("%s: file %s fetch from %s", rid, peerFile.Name, from) c, err := client.New( client.WithURL(daemonsubsystem.PeerURL(from)), client.WithUsername(t.localhost), @@ -151,7 +273,7 @@ func (t *Manager) fetchResourceFile(rid string, peerFile resource.File, from str defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return fmt.Errorf("%s: fetch %s: %s", rid, peerFile.Name, resp.Status) + return fmt.Errorf("unexpected api response status %s", resp.Status) } // Create a temp file and write the response body @@ -221,6 +343,6 @@ func (t *Manager) onFetchDone(c cmdFetchDone) { } t.files.fetching = false if t.files.attention != nil { - t.doHandleResourceFiles(t.files.attention) + t.handlePeerResourceFiles(t.files.attention) } } diff --git a/daemon/imon/main.go b/daemon/imon/main.go index 85fb35421..c960c2faa 100644 --- a/daemon/imon/main.go +++ b/daemon/imon/main.go @@ -36,7 +36,6 @@ import ( "github.com/opensvc/om3/v3/core/object" "github.com/opensvc/om3/v3/core/priority" "github.com/opensvc/om3/v3/core/rawconfig" - "github.com/opensvc/om3/v3/core/resource" "github.com/opensvc/om3/v3/core/status" "github.com/opensvc/om3/v3/daemon/daemondata" "github.com/opensvc/om3/v3/daemon/daemonenv" @@ -72,7 +71,7 @@ type ( // or InstanceStatusUpdated. instStatus map[string]instance.Status - files filesManager + files *filesManager // instMonitor tracks instance.Monitor for path on other nodes, iit is updated on // ObjectStatusUpdated for path events where srcEvent is InstanceMonitorDeleted @@ -170,20 +169,6 @@ type ( regularResourceOrchestrate orchestrationResource } - filesManager struct { - // fetched stores the resource files we fetched to avoid uneeded refetch - fetched map[string]resource.File - - // attention stores a pending InstanceStatusUpdated event received while the fetch - // manager was already processing an event. This serves as a flag to immediately - // retrigger a new fetch cycle upon completion of the current one. - attention *msgbus.InstanceStatusUpdated - - // fetching is true when the resource files fetch and ingest routine is - // running - fetching bool - } - // cmdOrchestrate can be used from post action go routines cmdOrchestrate struct { state instance.MonitorState @@ -201,7 +186,7 @@ type ( } cmdFetchDone struct { - Files resource.Files + Files ridFiles } Factory struct { @@ -256,9 +241,7 @@ func start(parent context.Context, qs pubsub.QueueSizer, p naming.Path, nodes [] cmdC: make(chan any), databus: databus, publisher: pubsub.PubFromContext(ctx), - files: filesManager{ - fetched: make(map[string]resource.File), - }, + files: newFilesManager(), instStatus: make(map[string]instance.Status), instMonitor: make(map[string]instance.Monitor), nodeMonitor: make(map[string]node.Monitor), @@ -385,6 +368,7 @@ func (t *Manager) worker(initialNodes []string) { t.initRelationAvailStatus() t.initResourceMonitor() + t.initLocalResourceFiles() t.updateIsLeader() t.updateIfChange() diff --git a/drivers/rescontainerocibase/executor.go b/drivers/rescontainerocibase/executor.go index 306578474..5c1f8f615 100644 --- a/drivers/rescontainerocibase/executor.go +++ b/drivers/rescontainerocibase/executor.go @@ -1,6 +1,7 @@ package rescontainerocibase import ( + "bytes" "context" "fmt" "io" @@ -69,13 +70,22 @@ func (e *Executor) Enter(ctx context.Context) error { candidates := []string{"/bin/bash", "/bin/sh"} ctx, cancel := context.WithTimeout(ctx, 10*time.Second) inspect, err := e.Inspect(ctx) + if err != nil { + return err + } + if inspect == nil { + return fmt.Errorf("the container is not running") + } pid := inspect.PID() + env, err := pidEnv(pid) if err != nil { return err } + outerLoop: for _, candidate := range candidates { - cmd := exec.CommandContext(ctx, "nsenter", "-t", fmt.Sprint(pid), "--all", "-e", "-w", candidate) + cmd := exec.CommandContext(ctx, "nsenter", "-t", fmt.Sprint(pid), "--all", "-w", candidate) + cmd.Env = env _ = cmd.Run() switch cmd.ProcessState.ExitCode() { @@ -91,7 +101,8 @@ outerLoop: return fmt.Errorf("can't enter: container needs at least one of following command: %s", strings.Join(candidates, ", ")) } - cmd := exec.Command("nsenter", "-t", fmt.Sprint(pid), "--all", "-e", "-w", enterCmd) + cmd := exec.Command("nsenter", "-t", fmt.Sprint(pid), "--all", "-w", enterCmd) + cmd.Env = env cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr @@ -106,6 +117,31 @@ outerLoop: return nil } +func pidEnv(pid int) ([]string, error) { + envPath := fmt.Sprintf("/proc/%d/environ", pid) + + data, err := os.ReadFile(envPath) + if err != nil { + return nil, err + } + + // Parse NUL-separated environment + raw := bytes.Split(data, []byte{0}) + + env := make([]string, 0, len(raw)) + for _, kv := range raw { + if len(kv) == 0 { + continue + } + // must contain '=' and not start with '=' + if i := bytes.IndexByte(kv, '='); i <= 0 { + continue + } + env = append(env, string(kv)) + } + return env, nil +} + func (e *Executor) HasImage(ctx context.Context) (bool, string, error) { var cmd *exec.Cmd a := e.getArgs(e.args.HasImageArgs().Get()...) diff --git a/drivers/resiphost/main.go b/drivers/resiphost/main.go index b52bfb694..36266b717 100644 --- a/drivers/resiphost/main.go +++ b/drivers/resiphost/main.go @@ -16,13 +16,15 @@ import ( "github.com/opensvc/om3/v3/core/resource" "github.com/opensvc/om3/v3/core/status" "github.com/opensvc/om3/v3/drivers/resip" - "github.com/opensvc/om3/v3/util/hostname" + "github.com/opensvc/om3/v3/util/duration" + "github.com/opensvc/om3/v3/util/getaddr" "github.com/opensvc/om3/v3/util/netif" "github.com/opensvc/om3/v3/util/ping" ) const ( tagNonRouted = "nonrouted" + maxIPAddrAge = 19 * time.Minute ) type ( @@ -46,9 +48,10 @@ type ( WaitDNS *time.Duration `json:"wait_dns"` // cache - _ipaddr net.IP - _ipmask net.IPMask - _ipnet *net.IPNet + _ipaddr net.IP + _ipaddrAge time.Duration + _ipmask net.IPMask + _ipnet *net.IPNet } Addrs []net.Addr @@ -92,10 +95,15 @@ func (t *T) getDevAndLabel() (string, string, error) { } func (t *T) Start(ctx context.Context) error { - if initialStatus := t.Status(ctx); initialStatus == status.Up { + if initialStatus := t.statusWithIPAddrCacheTrust(ctx); initialStatus == status.Up { t.Log().Infof("%s is already up on %s", t.Name, t.Dev) return nil } + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } dev, label, err := t.getDevAndLabel() if err != nil { return err @@ -116,6 +124,11 @@ func (t *T) Start(ctx context.Context) error { } func (t *T) Stop(ctx context.Context) error { + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } dev, _ := resip.SplitDevLabel(t.Dev) if err := t.stopAddr(ctx, dev); err != nil { return err @@ -124,6 +137,14 @@ func (t *T) Stop(ctx context.Context) error { } func (t *T) Status(ctx context.Context) status.T { + s := t.statusWithIPAddrCacheTrust(ctx) + if s == status.Up && t._ipaddrAge > 0 { + return status.Warn + } + return s +} + +func (t *T) statusWithIPAddrCacheTrust(ctx context.Context) status.T { if t.Name == "" { t.StatusLog().Warn("name not set") return status.NotApplicable @@ -159,10 +180,19 @@ func (t *T) statusOfAddr(ctx context.Context, dev string) status.T { err error addrs Addrs ) - ip := t.ipaddr() if t.Name == "" { return status.NotApplicable } + ip := t.ipaddr() + if ip == nil { + t.StatusLog().Error("ip %s lookup issue, cache miss", t.Name) + return status.Undef + } else if t._ipaddrAge > maxIPAddrAge { + t.StatusLog().Error("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + return status.Warn + } else if t._ipaddrAge > 0 { + t.StatusLog().Warn("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } if i, err = net.InterfaceByName(dev); err != nil { if fmt.Sprint(err.(*net.OpError).Unwrap()) == "no such network interface" { t.StatusLog().Warn("interface %s not found", dev) @@ -201,7 +231,10 @@ func (t *T) Abort(ctx context.Context) bool { if t.ipaddr() == nil { return false // let start fail with an explicit error message } - if initialStatus := t.Status(ctx); initialStatus == status.Up { + if t._ipaddrAge > maxIPAddrAge { + return false // let start fail with an explicit error message + } + if initialStatus := t.statusWithIPAddrCacheTrust(ctx); initialStatus == status.Up { return false // let start fail with an explicit error message } if t.CheckCarrier { @@ -250,8 +283,13 @@ func (t *T) ipaddr() net.IP { if t._ipaddr != nil { return t._ipaddr } - t._ipaddr = t.getIPAddr() - return t._ipaddr + ip, age, err := getaddr.Lookup(t.Name) + if getaddr.IsErrManyAddr(err) { + t.StatusLog().Warn("%s", err) + } + t._ipaddr = ip + t._ipaddrAge = age + return ip } func (t *T) ipmask() net.IPMask { @@ -304,33 +342,6 @@ func (t *T) defaultMask() (net.IPMask, error) { return net.Mask, nil } -func (t *T) getIPAddr() net.IP { - switch { - case naming.IsValidFQDN(t.Name) || hostname.IsValid(t.Name): - var ( - l []net.IP - err error - ) - l, err = net.LookupIP(t.Name) - if err != nil { - t.Log().Errorf("%s", err) - return nil - } - n := len(l) - switch n { - case 0: - t.Log().Errorf("name %s is unresolvable", t.Name) - case 1: - // ok - default: - t.Log().Tracef("name %s is resolvables to %d address. Using the first.", t.Name, n) - } - return l[0] - default: - return net.ParseIP(t.Name) - } -} - func (t Addrs) Has(ip net.IP) bool { for _, addr := range t { listIP, _, _ := net.ParseCIDR(addr.String()) diff --git a/drivers/resiphost/text/kw/name b/drivers/resiphost/text/kw/name index 811cb1da9..58b15da1f 100644 --- a/drivers/resiphost/text/kw/name +++ b/drivers/resiphost/text/kw/name @@ -1,8 +1,43 @@ The DNS name or IP address of the ip resource. +Scoping +------- + Can be different from one node to the other, in which case the `name@` scoping syntax can be used. This is most useful to specify a different ip when the service starts in DRP mode, where subnets are likely to be different than those of the production datacenter. + +DNS name caching +---------------- + +If the value is a DNS name, a lookup is done for every resource status +evaluation. + +* If the lookup succeeds, the addr is store in a cache file. + +* If the lookup fails, the address is fetched from the cache file. + + * If the cache is not expired (refreshed in the past 19 minutes) the + address is trusted: + + * The resource status is degraded to from `up` to `warn`. + + * The resource restarts begin if `restarts > 0`. + The restart is a no-op but the resource status is re-evaluated. + If the lookup succeeds, the resource status to `up`. + + * The monitor action is triggered if all restarts failed and + `monitor_action` is set. + For example, a `freezestop` can still succeed using the cached + ip address. + + * If the cache is expired (older than 19 minutes): + + * The resource status is `warn`, weither the actual untrusted ip + address is plumbed or not. + + * The `stop` and `start` actions no longer work. An admin action + is necessary to repair the configuration or dns lookups. diff --git a/drivers/resipnetns/main.go b/drivers/resipnetns/main.go index bb3522acb..c1e10eb14 100644 --- a/drivers/resipnetns/main.go +++ b/drivers/resipnetns/main.go @@ -20,7 +20,8 @@ import ( "github.com/opensvc/om3/v3/core/resource" "github.com/opensvc/om3/v3/core/status" "github.com/opensvc/om3/v3/drivers/resip" - "github.com/opensvc/om3/v3/util/hostname" + "github.com/opensvc/om3/v3/util/duration" + "github.com/opensvc/om3/v3/util/getaddr" "github.com/opensvc/om3/v3/util/netif" "github.com/opensvc/om3/v3/util/ping" @@ -30,6 +31,8 @@ import ( const ( tagNonRouted = "nonrouted" tagDedicated = "dedicated" + + maxIPAddrAge = 19 * time.Minute ) type ( @@ -60,9 +63,10 @@ type ( Expose []string `json:"expose"` // cache - _ipaddr net.IP - _ipmask net.IPMask - _ipnet *net.IPNet + _ipaddr net.IP + _ipaddrAge time.Duration + _ipmask net.IPMask + _ipnet *net.IPNet } Addrs []net.Addr @@ -138,6 +142,11 @@ func (t *T) ActionResourceDeps() []actionresdeps.Dep { } func (t *T) Start(ctx context.Context) error { + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } if err := t.startMode(ctx); err != nil { return err } @@ -271,6 +280,11 @@ func (t *T) startARP(netns ns.NetNS, guestDev string) error { } func (t *T) Stop(ctx context.Context) error { + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } if t.Tags.Has(tagDedicated) { return t.stopDedicated(ctx) } @@ -301,6 +315,14 @@ func (t *T) devMTU() (int, error) { } func (t *T) Status(ctx context.Context) status.T { + s := t.statusWithIPAddrCacheTrust(ctx) + if s == status.Up && t._ipaddrAge > 0 { + return status.Warn + } + return s +} + +func (t *T) statusWithIPAddrCacheTrust(ctx context.Context) status.T { var ( err error carrier bool @@ -329,6 +351,18 @@ func (t *T) Status(ctx context.Context) status.T { return status.Down } } + + ip := t.ipaddr() + if ip == nil { + t.StatusLog().Error("ip %s lookup issue, cache miss", t.Name) + return status.Undef + } else if t._ipaddrAge > maxIPAddrAge { + t.StatusLog().Error("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + return status.Undef + } else if t._ipaddrAge > 0 { + t.StatusLog().Warn("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } + netns, err := t.getNS(ctx) if err != nil { t.StatusLog().Error("netns: %s", err) @@ -380,7 +414,10 @@ func (t *T) Abort(ctx context.Context) bool { if t.ipaddr() == nil { return false // let start fail with an explicit error message } - if initialStatus := t.Status(ctx); initialStatus == status.Up { + if t._ipaddrAge > maxIPAddrAge { + return false // let start fail with an explicit error message + } + if initialStatus := t.statusWithIPAddrCacheTrust(ctx); initialStatus == status.Up { return false // let start fail with an explicit error message } if t.abortPing() { @@ -423,8 +460,13 @@ func (t *T) ipaddr() net.IP { if t._ipaddr != nil { return t._ipaddr } - t._ipaddr = t.getIPAddr() - return t._ipaddr + ip, age, err := getaddr.Lookup(t.Name) + if getaddr.IsErrManyAddr(err) { + t.StatusLog().Warn("%s", err) + } + t._ipaddr = ip + t._ipaddrAge = age + return ip } func (t *T) ipmask() net.IPMask { @@ -477,33 +519,6 @@ func (t *T) defaultMask() (net.IPMask, error) { return net.Mask, nil } -func (t *T) getIPAddr() net.IP { - switch { - case naming.IsValidFQDN(t.Name) || hostname.IsValid(t.Name): - var ( - l []net.IP - err error - ) - l, err = net.LookupIP(t.Name) - if err != nil { - t.Log().Errorf("%s", err) - return nil - } - n := len(l) - switch n { - case 0: - t.Log().Errorf("name %s is unresolvable", t.Name) - case 1: - // ok - default: - t.Log().Tracef("name %s is resolvables to %d address. Using the first.", t.Name, n) - } - return l[0] - default: - return net.ParseIP(t.Name) - } -} - func (t *T) netInterface() (*net.Interface, error) { return net.InterfaceByName(t.Dev) } diff --git a/drivers/resipnetns/text/kw/name b/drivers/resipnetns/text/kw/name index 811cb1da9..64fa81f0a 100644 --- a/drivers/resipnetns/text/kw/name +++ b/drivers/resipnetns/text/kw/name @@ -1,8 +1,44 @@ The DNS name or IP address of the ip resource. +Scoping +------- + Can be different from one node to the other, in which case the `name@` scoping syntax can be used. This is most useful to specify a different ip when the service starts in DRP mode, where subnets are likely to be different than those of the production datacenter. + +DNS name caching +---------------- + +If the value is a DNS name, a lookup is done for every resource status +evaluation. + +* If the lookup succeeds, the addr is store in a cache file. + +* If the lookup fails, the address is fetched from the cache file. + + * If the cache is not expired (refreshed in the past 19 minutes) the + address is trusted: + + * The resource status is degraded to from `up` to `warn`. + + * The resource restarts begin if `restarts > 0`. + The restart is a no-op but the resource status is re-evaluated. + If the lookup succeeds, the resource status to `up`. + + * The monitor action is triggered if all restarts failed and + `monitor_action` is set. + For example, a `freezestop` can still succeed using the cached + ip address. + + * If the cache is expired (older than 19 minutes): + + * The resource status is `warn`, weither the actual untrusted ip + address is plumbed or not. + + * The `stop` and `start` actions no longer work. An admin action + is necessary to repair the configuration or dns lookups. + diff --git a/util/getaddr/main.go b/util/getaddr/main.go new file mode 100644 index 000000000..20a5dfd03 --- /dev/null +++ b/util/getaddr/main.go @@ -0,0 +1,126 @@ +package getaddr + +import ( + "errors" + "fmt" + "net" + "os" + "path/filepath" + "time" + + "github.com/opensvc/om3/v3/core/naming" + "github.com/opensvc/om3/v3/core/rawconfig" + "github.com/opensvc/om3/v3/util/hostname" +) + +type ( + ErrCacheAddr struct { + name string + } + + ErrManyAddr struct { + name string + count int + } +) + +var ( + // netLookupIP is here to facilitate mocking in tests + netLookupIP = net.LookupIP + + // cacheDir is where to store the successful lookup results + cacheDir = filepath.Join(rawconfig.Paths.Var, "cache", "addrinfo") +) + +func (t ErrManyAddr) Error() string { + return fmt.Sprintf("name %s resolves to %d address", t.name, t.count) +} + +func (t ErrCacheAddr) Error() string { + return fmt.Sprintf("error caching the name %s addr", t.name) +} + +func IsErrManyAddr(err error) bool { + var e ErrManyAddr + return errors.As(err, &e) +} + +func fmtCacheFile(name string) string { + return filepath.Join(cacheDir, name) +} + +func load(name string) (net.IP, time.Duration, error) { + cacheFilename := fmtCacheFile(name) + stat, err := os.Stat(cacheFilename) + if err != nil { + return nil, 0, err + } + b, err := os.ReadFile(cacheFilename) + if err != nil { + return nil, 0, err + } + return net.ParseIP(string(b)), time.Since(stat.ModTime()), nil +} + +func cache(name string, ip net.IP) error { + if ip == nil { + return fmt.Errorf("refuse to cache invalid ip") + } + cacheFilename := fmtCacheFile(name) + write := func() error { + return os.WriteFile(cacheFilename, []byte(ip.String()), 0o0644) + } + if err := write(); !os.IsNotExist(err) { + return err + } + if err := os.MkdirAll(cacheDir, os.ModePerm); err != nil { + return err + } + return write() +} + +func lookup(name string) (net.IP, error) { + var ( + l []net.IP + err error + ) + l, err = netLookupIP(name) + if err != nil { + return nil, err + } + n := len(l) + switch n { + case 0: + return nil, fmt.Errorf("name %s is unresolvable", name) + case 1: + // ok + default: + return l[0], ErrManyAddr{name: name, count: n} + } + return l[0], nil +} + +func lookupAndCache(name string) (net.IP, error) { + if !naming.IsValidFQDN(name) && !hostname.IsValid(name) { + ip := net.ParseIP(name) + if ip == nil { + return nil, fmt.Errorf("unparsable ip %s", name) + } + return ip, nil + } + ip, err := lookup(name) + if err == nil || IsErrManyAddr(err) { + if err := cache(name, ip); err != nil { + return ip, fmt.Errorf("%w: %w", ErrCacheAddr{name: name}, err) + } + } + return ip, err +} + +func Lookup(name string) (net.IP, time.Duration, error) { + ip, err := lookupAndCache(name) + if err == nil { + return ip, 0, nil + } + return load(name) +} diff --git a/util/getaddr/main_test.go b/util/getaddr/main_test.go new file mode 100644 index 000000000..1301e6d1f --- /dev/null +++ b/util/getaddr/main_test.go @@ -0,0 +1,283 @@ +package getaddr + +import ( + "errors" + "net" + "os" + "path/filepath" + "testing" + + "github.com/opensvc/om3/v3/core/naming" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// mockLookupIP simule net.LookupIP pour les tests. +var mockLookupIP = func(name string) ([]net.IP, error) { + return nil, errors.New("not implemented") +} + +// saveOriginalLookupIP sauvegarde la fonction originale netLookupIP. +var originalLookupIP func(string) ([]net.IP, error) + +// TestMain configure les tests et nettoie après. +func TestMain(m *testing.M) { + // Sauvegarder la fonction originale netLookupIP + originalLookupIP = netLookupIP + // Remplacer par notre mock + netLookupIP = mockLookupIP + + // Exécuter les tests + code := m.Run() + + // Restaurer la fonction originale + netLookupIP = originalLookupIP + + os.Exit(code) +} + +func TestErrManyAddr_Error(t *testing.T) { + err := ErrManyAddr{name: "example.com", count: 3} + assert.Equal(t, "name example.com resolves to 3 address", err.Error()) +} + +func TestErrCacheAddr_Error(t *testing.T) { + err := ErrCacheAddr{name: "example.com"} + assert.Equal(t, "error caching the name example.com addr", err.Error()) +} + +func TestFmtCacheFile(t *testing.T) { + // Sauvegarder la valeur originale de cacheDir + originalCacheDir := cacheDir + // Définir un chemin temporaire pour les tests + cacheDir = filepath.Join(os.TempDir(), "test_cache_dir") + + // Tester fmtCacheFile + filename := fmtCacheFile("example.com") + expected := filepath.Join(cacheDir, "example.com") + assert.Equal(t, expected, filename) + + // Restaurer cacheDir + cacheDir = originalCacheDir +} + +func TestLoad(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + // Créer un fichier de cache factice + cacheFile := filepath.Join(tempDir, "example.com") + ip := net.ParseIP("192.168.1.1") + err := os.WriteFile(cacheFile, []byte(ip.String()), 0o644) + require.NoError(t, err) + + // Tester load + loadedIP, duration, err := load("example.com") + require.NoError(t, err) + assert.Equal(t, ip, loadedIP) + assert.True(t, duration > 0) + + // Tester avec un fichier inexistant + _, _, err = load("nonexistent.com") + assert.Error(t, err) + + // Restaurer cacheDir + cacheDir = originalCacheDir +} + +func TestCache(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + // Tester cache avec un IP valide + ip := net.ParseIP("192.168.1.1") + err := cache("example.com", ip) + require.NoError(t, err) + + // Vérifier que le fichier a été créé + cacheFile := filepath.Join(tempDir, "example.com") + _, err = os.Stat(cacheFile) + assert.NoError(t, err) + + // Tester avec un IP invalide (nil) + err = cache("example.com", nil) + assert.Error(t, err) + + // Restaurer cacheDir + cacheDir = originalCacheDir +} + +func TestLookup(t *testing.T) { + tests := []struct { + name string + mockFunc func(string) ([]net.IP, error) + expected net.IP + err error + }{ + { + name: "single IP", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{net.ParseIP("192.168.1.1")}, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: nil, + }, + { + name: "no IP", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{}, nil + }, + expected: nil, + err: errors.New("name example.com is unresolvable"), + }, + { + name: "multiple IPs", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{ + net.ParseIP("192.168.1.1"), + net.ParseIP("192.168.1.2"), + }, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: ErrManyAddr{name: "example.com", count: 2}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Remplacer le mock + netLookupIP = tt.mockFunc + + ip, err := lookup("example.com") + if tt.err != nil { + assert.EqualError(t, err, tt.err.Error()) + } else { + assert.NoError(t, err) + } + assert.Equal(t, tt.expected, ip) + }) + } + + // Restaurer le mock original + netLookupIP = mockLookupIP +} + +func TestLookupAndCache(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + tests := []struct { + name string + input string + mockFunc func(string) ([]net.IP, error) + expected net.IP + err error + }{ + { + name: "valid IP string", + input: "192.168.1.1", + expected: net.ParseIP("192.168.1.1"), + err: nil, + }, + { + name: "invalid IP string", + input: "1.2.3.?", + expected: nil, + err: errors.New("unparsable ip 1.2.3.?"), + }, + { + name: "valid FQDN", + input: "example.com", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{net.ParseIP("192.168.1.1")}, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: nil, + }, + { + name: "unresolvable FQDN", + input: "unresolvable.com", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{}, nil + }, + expected: nil, + err: errors.New("name unresolvable.com is unresolvable"), + }, + { + name: "FQDN with multiple IPs", + input: "multihomed.com", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{ + net.ParseIP("192.168.1.1"), + net.ParseIP("192.168.1.2"), + }, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: ErrManyAddr{name: "multihomed.com", count: 2}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Remplacer le mock + netLookupIP = tt.mockFunc + + ip, err := lookupAndCache(tt.input) + if tt.err != nil { + assert.EqualError(t, err, tt.err.Error()) + } else { + assert.NoError(t, err) + } + assert.Equal(t, tt.expected, ip) + + // Vérifier que le cache a été créé si nécessaire + if tt.err == nil && naming.IsValidFQDN(tt.input) { + cacheFile := filepath.Join(tempDir, tt.input) + _, err := os.Stat(cacheFile) + assert.NoError(t, err) + } + }) + } + + // Restaurer le mock original et cacheDir + netLookupIP = mockLookupIP + cacheDir = originalCacheDir +} + +func TestLookupFunction(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + // Créer un fichier de cache factice + cacheFile := filepath.Join(tempDir, "example.com") + ip := net.ParseIP("192.168.1.1") + err := os.WriteFile(cacheFile, []byte(ip.String()), 0o644) + require.NoError(t, err) + + // Définir un mock pour netLookupIP qui échoue + netLookupIP = func(name string) ([]net.IP, error) { + return nil, errors.New("lookup failed") + } + + // Tester Lookup avec un cache existant + loadedIP, duration, err := Lookup("example.com") + require.NoError(t, err) + assert.Equal(t, ip, loadedIP) + assert.True(t, duration > 0) + + // Tester Lookup sans cache + _, _, err = Lookup("nonexistent.com") + assert.Error(t, err) + + // Restaurer cacheDir et le mock original + cacheDir = originalCacheDir + netLookupIP = originalLookupIP +}