From abf81915fc7a429830bedffde9058287af15b8b6 Mon Sep 17 00:00:00 2001 From: Christophe Varoqui Date: Sat, 10 Jan 2026 22:10:44 +0100 Subject: [PATCH 1/7] Add the ip addr file cache /var/lib/opensvc/cache/addrinfo/ files contain the resolved ipaddr. When the name resolution does not work, the cache files content is trusted for 16m. After that duration, the ip resource status is considered undef and we now longer accept to stop or start the resource as we don't know the addr. --- drivers/resiphost/main.go | 71 +++++----- drivers/resipnetns/main.go | 73 +++++----- util/getaddr/main.go | 126 +++++++++++++++++ util/getaddr/main_test.go | 283 +++++++++++++++++++++++++++++++++++++ 4 files changed, 486 insertions(+), 67 deletions(-) create mode 100644 util/getaddr/main.go create mode 100644 util/getaddr/main_test.go diff --git a/drivers/resiphost/main.go b/drivers/resiphost/main.go index b52bfb694..1a1c71a68 100644 --- a/drivers/resiphost/main.go +++ b/drivers/resiphost/main.go @@ -16,13 +16,15 @@ import ( "github.com/opensvc/om3/v3/core/resource" "github.com/opensvc/om3/v3/core/status" "github.com/opensvc/om3/v3/drivers/resip" - "github.com/opensvc/om3/v3/util/hostname" + "github.com/opensvc/om3/v3/util/duration" + "github.com/opensvc/om3/v3/util/getaddr" "github.com/opensvc/om3/v3/util/netif" "github.com/opensvc/om3/v3/util/ping" ) const ( tagNonRouted = "nonrouted" + maxIPAddrAge = 16 * time.Minute ) type ( @@ -46,9 +48,10 @@ type ( WaitDNS *time.Duration `json:"wait_dns"` // cache - _ipaddr net.IP - _ipmask net.IPMask - _ipnet *net.IPNet + _ipaddr net.IP + _ipaddrAge time.Duration + _ipmask net.IPMask + _ipnet *net.IPNet } Addrs []net.Addr @@ -96,6 +99,11 @@ func (t *T) Start(ctx context.Context) error { t.Log().Infof("%s is already up on %s", t.Name, t.Dev) return nil } + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } dev, label, err := t.getDevAndLabel() if err != nil { return err @@ -116,6 +124,11 @@ func (t *T) Start(ctx context.Context) error { } func (t *T) Stop(ctx context.Context) error { + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } dev, _ := resip.SplitDevLabel(t.Dev) if err := t.stopAddr(ctx, dev); err != nil { return err @@ -159,10 +172,16 @@ func (t *T) statusOfAddr(ctx context.Context, dev string) status.T { err error addrs Addrs ) - ip := t.ipaddr() if t.Name == "" { return status.NotApplicable } + ip := t.ipaddr() + if t._ipaddrAge > maxIPAddrAge { + t.StatusLog().Error("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + return status.Undef + } else if t._ipaddrAge > 0 { + t.StatusLog().Warn("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } if i, err = net.InterfaceByName(dev); err != nil { if fmt.Sprint(err.(*net.OpError).Unwrap()) == "no such network interface" { t.StatusLog().Warn("interface %s not found", dev) @@ -179,6 +198,9 @@ func (t *T) statusOfAddr(ctx context.Context, dev string) status.T { t.Log().Tracef("ip not found on intf") return status.Down } + if t._ipaddrAge > 0 { + return status.Warn + } return status.Up } @@ -201,6 +223,9 @@ func (t *T) Abort(ctx context.Context) bool { if t.ipaddr() == nil { return false // let start fail with an explicit error message } + if t._ipaddrAge > maxIPAddrAge { + return false // let start fail with an explicit error message + } if initialStatus := t.Status(ctx); initialStatus == status.Up { return false // let start fail with an explicit error message } @@ -250,8 +275,13 @@ func (t *T) ipaddr() net.IP { if t._ipaddr != nil { return t._ipaddr } - t._ipaddr = t.getIPAddr() - return t._ipaddr + ip, age, err := getaddr.Lookup(t.Name) + if getaddr.IsErrManyAddr(err) { + t.StatusLog().Warn("%s", err) + } + t._ipaddr = ip + t._ipaddrAge = age + return ip } func (t *T) ipmask() net.IPMask { @@ -304,33 +334,6 @@ func (t *T) defaultMask() (net.IPMask, error) { return net.Mask, nil } -func (t *T) getIPAddr() net.IP { - switch { - case naming.IsValidFQDN(t.Name) || hostname.IsValid(t.Name): - var ( - l []net.IP - err error - ) - l, err = net.LookupIP(t.Name) - if err != nil { - t.Log().Errorf("%s", err) - return nil - } - n := len(l) - switch n { - case 0: - t.Log().Errorf("name %s is unresolvable", t.Name) - case 1: - // ok - default: - t.Log().Tracef("name %s is resolvables to %d address. Using the first.", t.Name, n) - } - return l[0] - default: - return net.ParseIP(t.Name) - } -} - func (t Addrs) Has(ip net.IP) bool { for _, addr := range t { listIP, _, _ := net.ParseCIDR(addr.String()) diff --git a/drivers/resipnetns/main.go b/drivers/resipnetns/main.go index bb3522acb..7403cf229 100644 --- a/drivers/resipnetns/main.go +++ b/drivers/resipnetns/main.go @@ -20,7 +20,8 @@ import ( "github.com/opensvc/om3/v3/core/resource" "github.com/opensvc/om3/v3/core/status" "github.com/opensvc/om3/v3/drivers/resip" - "github.com/opensvc/om3/v3/util/hostname" + "github.com/opensvc/om3/v3/util/duration" + "github.com/opensvc/om3/v3/util/getaddr" "github.com/opensvc/om3/v3/util/netif" "github.com/opensvc/om3/v3/util/ping" @@ -30,6 +31,8 @@ import ( const ( tagNonRouted = "nonrouted" tagDedicated = "dedicated" + + maxIPAddrAge = 16 * time.Minute ) type ( @@ -60,9 +63,10 @@ type ( Expose []string `json:"expose"` // cache - _ipaddr net.IP - _ipmask net.IPMask - _ipnet *net.IPNet + _ipaddr net.IP + _ipaddrAge time.Duration + _ipmask net.IPMask + _ipnet *net.IPNet } Addrs []net.Addr @@ -138,6 +142,11 @@ func (t *T) ActionResourceDeps() []actionresdeps.Dep { } func (t *T) Start(ctx context.Context) error { + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } if err := t.startMode(ctx); err != nil { return err } @@ -271,6 +280,11 @@ func (t *T) startARP(netns ns.NetNS, guestDev string) error { } func (t *T) Stop(ctx context.Context) error { + if t._ipaddrAge > maxIPAddrAge { + return fmt.Errorf("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } else if t._ipaddrAge > 0 { + t.Log().Warnf("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } if t.Tags.Has(tagDedicated) { return t.stopDedicated(ctx) } @@ -329,6 +343,15 @@ func (t *T) Status(ctx context.Context) status.T { return status.Down } } + + _ = t.ipaddr() + if t._ipaddrAge > maxIPAddrAge { + t.StatusLog().Error("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + return status.Undef + } else if t._ipaddrAge > 0 { + t.StatusLog().Warn("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + } + netns, err := t.getNS(ctx) if err != nil { t.StatusLog().Error("netns: %s", err) @@ -347,6 +370,9 @@ func (t *T) Status(ctx context.Context) status.T { if guestDev == "" { return status.Down } + if t._ipaddrAge > 0 { + return status.Warn + } return status.Up } @@ -380,6 +406,9 @@ func (t *T) Abort(ctx context.Context) bool { if t.ipaddr() == nil { return false // let start fail with an explicit error message } + if t._ipaddrAge > maxIPAddrAge { + return false // let start fail with an explicit error message + } if initialStatus := t.Status(ctx); initialStatus == status.Up { return false // let start fail with an explicit error message } @@ -423,8 +452,13 @@ func (t *T) ipaddr() net.IP { if t._ipaddr != nil { return t._ipaddr } - t._ipaddr = t.getIPAddr() - return t._ipaddr + ip, age, err := getaddr.Lookup(t.Name) + if getaddr.IsErrManyAddr(err) { + t.StatusLog().Warn("%s", err) + } + t._ipaddr = ip + t._ipaddrAge = age + return ip } func (t *T) ipmask() net.IPMask { @@ -477,33 +511,6 @@ func (t *T) defaultMask() (net.IPMask, error) { return net.Mask, nil } -func (t *T) getIPAddr() net.IP { - switch { - case naming.IsValidFQDN(t.Name) || hostname.IsValid(t.Name): - var ( - l []net.IP - err error - ) - l, err = net.LookupIP(t.Name) - if err != nil { - t.Log().Errorf("%s", err) - return nil - } - n := len(l) - switch n { - case 0: - t.Log().Errorf("name %s is unresolvable", t.Name) - case 1: - // ok - default: - t.Log().Tracef("name %s is resolvables to %d address. Using the first.", t.Name, n) - } - return l[0] - default: - return net.ParseIP(t.Name) - } -} - func (t *T) netInterface() (*net.Interface, error) { return net.InterfaceByName(t.Dev) } diff --git a/util/getaddr/main.go b/util/getaddr/main.go new file mode 100644 index 000000000..20a5dfd03 --- /dev/null +++ b/util/getaddr/main.go @@ -0,0 +1,126 @@ +package getaddr + +import ( + "errors" + "fmt" + "net" + "os" + "path/filepath" + "time" + + "github.com/opensvc/om3/v3/core/naming" + "github.com/opensvc/om3/v3/core/rawconfig" + "github.com/opensvc/om3/v3/util/hostname" +) + +type ( + ErrCacheAddr struct { + name string + } + + ErrManyAddr struct { + name string + count int + } +) + +var ( + // netLookupIP is here to facilitate mocking in tests + netLookupIP = net.LookupIP + + // cacheDir is where to store the successful lookup results + cacheDir = filepath.Join(rawconfig.Paths.Var, "cache", "addrinfo") +) + +func (t ErrManyAddr) Error() string { + return fmt.Sprintf("name %s resolves to %d address", t.name, t.count) +} + +func (t ErrCacheAddr) Error() string { + return fmt.Sprintf("error caching the name %s addr", t.name) +} + +func IsErrManyAddr(err error) bool { + var e ErrManyAddr + return errors.As(err, &e) +} + +func fmtCacheFile(name string) string { + return filepath.Join(cacheDir, name) +} + +func load(name string) (net.IP, time.Duration, error) { + cacheFilename := fmtCacheFile(name) + stat, err := os.Stat(cacheFilename) + if err != nil { + return nil, 0, err + } + b, err := os.ReadFile(cacheFilename) + if err != nil { + return nil, 0, err + } + return net.ParseIP(string(b)), time.Since(stat.ModTime()), nil +} + +func cache(name string, ip net.IP) error { + if ip == nil { + return fmt.Errorf("refuse to cache invalid ip") + } + cacheFilename := fmtCacheFile(name) + write := func() error { + return os.WriteFile(cacheFilename, []byte(ip.String()), 0o0644) + } + if err := write(); !os.IsNotExist(err) { + return err + } + if err := os.MkdirAll(cacheDir, os.ModePerm); err != nil { + return err + } + return write() +} + +func lookup(name string) (net.IP, error) { + var ( + l []net.IP + err error + ) + l, err = netLookupIP(name) + if err != nil { + return nil, err + } + n := len(l) + switch n { + case 0: + return nil, fmt.Errorf("name %s is unresolvable", name) + case 1: + // ok + default: + return l[0], ErrManyAddr{name: name, count: n} + } + return l[0], nil +} + +func lookupAndCache(name string) (net.IP, error) { + if !naming.IsValidFQDN(name) && !hostname.IsValid(name) { + ip := net.ParseIP(name) + if ip == nil { + return nil, fmt.Errorf("unparsable ip %s", name) + } + return ip, nil + } + ip, err := lookup(name) + if err == nil || IsErrManyAddr(err) { + if err := cache(name, ip); err != nil { + return ip, fmt.Errorf("%w: %w", ErrCacheAddr{name: name}, err) + } + } + return ip, err +} + +func Lookup(name string) (net.IP, time.Duration, error) { + ip, err := lookupAndCache(name) + if err == nil { + return ip, 0, nil + } + return load(name) +} diff --git a/util/getaddr/main_test.go b/util/getaddr/main_test.go new file mode 100644 index 000000000..1301e6d1f --- /dev/null +++ b/util/getaddr/main_test.go @@ -0,0 +1,283 @@ +package getaddr + +import ( + "errors" + "net" + "os" + "path/filepath" + "testing" + + "github.com/opensvc/om3/v3/core/naming" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// mockLookupIP simule net.LookupIP pour les tests. +var mockLookupIP = func(name string) ([]net.IP, error) { + return nil, errors.New("not implemented") +} + +// saveOriginalLookupIP sauvegarde la fonction originale netLookupIP. +var originalLookupIP func(string) ([]net.IP, error) + +// TestMain configure les tests et nettoie après. +func TestMain(m *testing.M) { + // Sauvegarder la fonction originale netLookupIP + originalLookupIP = netLookupIP + // Remplacer par notre mock + netLookupIP = mockLookupIP + + // Exécuter les tests + code := m.Run() + + // Restaurer la fonction originale + netLookupIP = originalLookupIP + + os.Exit(code) +} + +func TestErrManyAddr_Error(t *testing.T) { + err := ErrManyAddr{name: "example.com", count: 3} + assert.Equal(t, "name example.com resolves to 3 address", err.Error()) +} + +func TestErrCacheAddr_Error(t *testing.T) { + err := ErrCacheAddr{name: "example.com"} + assert.Equal(t, "error caching the name example.com addr", err.Error()) +} + +func TestFmtCacheFile(t *testing.T) { + // Sauvegarder la valeur originale de cacheDir + originalCacheDir := cacheDir + // Définir un chemin temporaire pour les tests + cacheDir = filepath.Join(os.TempDir(), "test_cache_dir") + + // Tester fmtCacheFile + filename := fmtCacheFile("example.com") + expected := filepath.Join(cacheDir, "example.com") + assert.Equal(t, expected, filename) + + // Restaurer cacheDir + cacheDir = originalCacheDir +} + +func TestLoad(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + // Créer un fichier de cache factice + cacheFile := filepath.Join(tempDir, "example.com") + ip := net.ParseIP("192.168.1.1") + err := os.WriteFile(cacheFile, []byte(ip.String()), 0o644) + require.NoError(t, err) + + // Tester load + loadedIP, duration, err := load("example.com") + require.NoError(t, err) + assert.Equal(t, ip, loadedIP) + assert.True(t, duration > 0) + + // Tester avec un fichier inexistant + _, _, err = load("nonexistent.com") + assert.Error(t, err) + + // Restaurer cacheDir + cacheDir = originalCacheDir +} + +func TestCache(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + // Tester cache avec un IP valide + ip := net.ParseIP("192.168.1.1") + err := cache("example.com", ip) + require.NoError(t, err) + + // Vérifier que le fichier a été créé + cacheFile := filepath.Join(tempDir, "example.com") + _, err = os.Stat(cacheFile) + assert.NoError(t, err) + + // Tester avec un IP invalide (nil) + err = cache("example.com", nil) + assert.Error(t, err) + + // Restaurer cacheDir + cacheDir = originalCacheDir +} + +func TestLookup(t *testing.T) { + tests := []struct { + name string + mockFunc func(string) ([]net.IP, error) + expected net.IP + err error + }{ + { + name: "single IP", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{net.ParseIP("192.168.1.1")}, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: nil, + }, + { + name: "no IP", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{}, nil + }, + expected: nil, + err: errors.New("name example.com is unresolvable"), + }, + { + name: "multiple IPs", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{ + net.ParseIP("192.168.1.1"), + net.ParseIP("192.168.1.2"), + }, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: ErrManyAddr{name: "example.com", count: 2}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Remplacer le mock + netLookupIP = tt.mockFunc + + ip, err := lookup("example.com") + if tt.err != nil { + assert.EqualError(t, err, tt.err.Error()) + } else { + assert.NoError(t, err) + } + assert.Equal(t, tt.expected, ip) + }) + } + + // Restaurer le mock original + netLookupIP = mockLookupIP +} + +func TestLookupAndCache(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + tests := []struct { + name string + input string + mockFunc func(string) ([]net.IP, error) + expected net.IP + err error + }{ + { + name: "valid IP string", + input: "192.168.1.1", + expected: net.ParseIP("192.168.1.1"), + err: nil, + }, + { + name: "invalid IP string", + input: "1.2.3.?", + expected: nil, + err: errors.New("unparsable ip 1.2.3.?"), + }, + { + name: "valid FQDN", + input: "example.com", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{net.ParseIP("192.168.1.1")}, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: nil, + }, + { + name: "unresolvable FQDN", + input: "unresolvable.com", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{}, nil + }, + expected: nil, + err: errors.New("name unresolvable.com is unresolvable"), + }, + { + name: "FQDN with multiple IPs", + input: "multihomed.com", + mockFunc: func(name string) ([]net.IP, error) { + return []net.IP{ + net.ParseIP("192.168.1.1"), + net.ParseIP("192.168.1.2"), + }, nil + }, + expected: net.ParseIP("192.168.1.1"), + err: ErrManyAddr{name: "multihomed.com", count: 2}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Remplacer le mock + netLookupIP = tt.mockFunc + + ip, err := lookupAndCache(tt.input) + if tt.err != nil { + assert.EqualError(t, err, tt.err.Error()) + } else { + assert.NoError(t, err) + } + assert.Equal(t, tt.expected, ip) + + // Vérifier que le cache a été créé si nécessaire + if tt.err == nil && naming.IsValidFQDN(tt.input) { + cacheFile := filepath.Join(tempDir, tt.input) + _, err := os.Stat(cacheFile) + assert.NoError(t, err) + } + }) + } + + // Restaurer le mock original et cacheDir + netLookupIP = mockLookupIP + cacheDir = originalCacheDir +} + +func TestLookupFunction(t *testing.T) { + // Créer un répertoire temporaire pour les tests + tempDir := t.TempDir() + originalCacheDir := cacheDir + cacheDir = tempDir + + // Créer un fichier de cache factice + cacheFile := filepath.Join(tempDir, "example.com") + ip := net.ParseIP("192.168.1.1") + err := os.WriteFile(cacheFile, []byte(ip.String()), 0o644) + require.NoError(t, err) + + // Définir un mock pour netLookupIP qui échoue + netLookupIP = func(name string) ([]net.IP, error) { + return nil, errors.New("lookup failed") + } + + // Tester Lookup avec un cache existant + loadedIP, duration, err := Lookup("example.com") + require.NoError(t, err) + assert.Equal(t, ip, loadedIP) + assert.True(t, duration > 0) + + // Tester Lookup sans cache + _, _, err = Lookup("nonexistent.com") + assert.Error(t, err) + + // Restaurer cacheDir et le mock original + cacheDir = originalCacheDir + netLookupIP = originalLookupIP +} From 2d036d90561c767a9e27f142889b58b2c289515d Mon Sep 17 00:00:00 2001 From: Christophe Varoqui Date: Mon, 12 Jan 2026 16:01:35 +0100 Subject: [PATCH 2/7] Fix a hang with "om enter --rid container#db" With micro-containers we used "nsenter -e" but this can hang with certain /proc//environ contents. Parse the container process environ ourselves and pass the env to the exec.Command. Don't use -e anymore. --- drivers/rescontainerocibase/executor.go | 37 +++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/rescontainerocibase/executor.go b/drivers/rescontainerocibase/executor.go index 306578474..c89ed9df9 100644 --- a/drivers/rescontainerocibase/executor.go +++ b/drivers/rescontainerocibase/executor.go @@ -1,6 +1,7 @@ package rescontainerocibase import ( + "bytes" "context" "fmt" "io" @@ -73,9 +74,15 @@ func (e *Executor) Enter(ctx context.Context) error { if err != nil { return err } + env, err := pidEnv(pid) + if err != nil { + return err + } + outerLoop: for _, candidate := range candidates { - cmd := exec.CommandContext(ctx, "nsenter", "-t", fmt.Sprint(pid), "--all", "-e", "-w", candidate) + cmd := exec.CommandContext(ctx, "nsenter", "-t", fmt.Sprint(pid), "--all", "-w", candidate) + cmd.Env = env _ = cmd.Run() switch cmd.ProcessState.ExitCode() { @@ -91,7 +98,8 @@ outerLoop: return fmt.Errorf("can't enter: container needs at least one of following command: %s", strings.Join(candidates, ", ")) } - cmd := exec.Command("nsenter", "-t", fmt.Sprint(pid), "--all", "-e", "-w", enterCmd) + cmd := exec.Command("nsenter", "-t", fmt.Sprint(pid), "--all", "-w", enterCmd) + cmd.Env = env cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr @@ -106,6 +114,31 @@ outerLoop: return nil } +func pidEnv(pid int) ([]string, error) { + envPath := fmt.Sprintf("/proc/%d/environ", pid) + + data, err := os.ReadFile(envPath) + if err != nil { + return nil, err + } + + // Parse NUL-separated environment + raw := bytes.Split(data, []byte{0}) + + env := make([]string, 0, len(raw)) + for _, kv := range raw { + if len(kv) == 0 { + continue + } + // must contain '=' and not start with '=' + if i := bytes.IndexByte(kv, '='); i <= 0 { + continue + } + env = append(env, string(kv)) + } + return env, nil +} + func (e *Executor) HasImage(ctx context.Context) (bool, string, error) { var cmd *exec.Cmd a := e.getArgs(e.args.HasImageArgs().Get()...) From bdad9e4db7bfa2708e715d8aec1ff260af2428bb Mon Sep 17 00:00:00 2001 From: Christophe Varoqui Date: Mon, 12 Jan 2026 16:55:44 +0100 Subject: [PATCH 3/7] Purge the fetched resource files cache when a file is removed So the secondary instance is ready to refetch when the up instance status is next refreshed. This patch fixes a case where the file was never refetched when the admin removed the file from a secondary node in the hope to demonstrate how opensvc handles resync automatically. --- daemon/imon/fetch_resource_files.go | 32 +++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/daemon/imon/fetch_resource_files.go b/daemon/imon/fetch_resource_files.go index 665e9a40d..36774aad2 100644 --- a/daemon/imon/fetch_resource_files.go +++ b/daemon/imon/fetch_resource_files.go @@ -33,7 +33,11 @@ func (t *filesManager) Fetched() resource.Files { } func (t *Manager) handleResourceFiles(ev *msgbus.InstanceStatusUpdated) { - if t.files.fetching { + if ev.Node == t.localhost { + t.doHandleLocalResourceFiles(ev) + return + } + if t.files.fetching && t.needFetchResourceFiles(ev) { t.files.attention = ev return } @@ -59,9 +63,6 @@ func (t *Manager) needFetchResourceFiles(ev *msgbus.InstanceStatusUpdated) bool } // Is the remote node a valid resource file authority - if ev.Node == t.localhost { - return false - } if ev.Value.Avail != status.Up { return false } @@ -69,6 +70,29 @@ func (t *Manager) needFetchResourceFiles(ev *msgbus.InstanceStatusUpdated) bool return true } +func (t *Manager) doHandleLocalResourceFiles(ev *msgbus.InstanceStatusUpdated) { + if t.instConfig.ActorConfig == nil { + return + } + announcedFilenames := make(map[string]any) + for _, localResourceStatus := range ev.Value.Resources { + for _, f := range localResourceStatus.Files { + announcedFilenames[f.Name] = nil + } + } + var toDelete []string + for filename := range t.files.fetched { + if _, ok := announcedFilenames[filename]; !ok { + // This already fetched file is no longer announced by the resource + // Drop. + toDelete = append(toDelete, filename) + } + } + for _, filename := range toDelete { + delete(t.files.fetched, filename) + } +} + func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus instance.Status, ev *msgbus.InstanceStatusUpdated) { var ( From 2bd9b4c9d46c148502a66bb51b40c7b5dc754c25 Mon Sep 17 00:00:00 2001 From: Christophe Varoqui Date: Tue, 13 Jan 2026 10:46:07 +0100 Subject: [PATCH 4/7] Fix a stack on entering a not-running container e.g.: # om test/svc/hdoc enter --rid container#hdoc panic: runtime error: invalid memory address or nil pointer dereference [signal SIGSEGV: segmentation violation code=0x1 addr=0x48 pc=0x1b01d04] goroutine 15 [running]: github.com/opensvc/om3/v3/drivers/rescontainerocibase.(*Executor).Enter(0xc0002caaa0, {0x37e7cb8?, 0xc0004eeff0?}) /root/dev/om3/drivers/rescontainerocibase/executor.go:73 +0xc4 github.com/opensvc/om3/v3/drivers/rescontainerocibase.(*BT).Enter(0x446df20?, {0x37e7cb8?, 0xc0004eeff0?}) /root/dev/om3/drivers/rescontainerocibase/main.go:318 +0x32 github.com/opensvc/om3/v3/core/object.(*actor).Enter(0xc0004f0fc0, {0x37e7cb8, 0xc0004eeff0}, {0x7fffe2325645, 0xe}) /root/dev/om3/core/object/actor_enter.go:31 +0x214 github.com/opensvc/om3/v3/core/omcmd.(*CmdObjectEnter).Run.func1({0x37e7cb8, 0xc0004eeff0}, {{0x7fffe2325634, 0x4}, {0x7fffe232562b, 0x4}, {0x7fffe2325630, 0x3}}) /root/dev/om3/core/omcmd/object_enter.go:33 +0xa2 github.com/opensvc/om3/v3/core/objectaction.T.DoLocal.func1({0x37e7cb8?, 0xc0004eeff0?}, {0x0?, 0x0?}, {{0x7fffe2325634, 0x4}, {0x7fffe232562b, 0x4}, {0x7fffe2325630, 0x3}}) /root/dev/om3/core/objectaction/object.go:383 +0x68 github.com/opensvc/om3/v3/core/objectaction.T.instanceDo.func1({0xc00022441a?, 0x0?}, {{0x7fffe2325634, 0x4}, {0x7fffe232562b, 0x4}, {0x7fffe2325630, 0x3}}) /root/dev/om3/core/objectaction/object.go:986 +0x105 created by github.com/opensvc/om3/v3/core/objectaction.T.instanceDo in goroutine 1 /root/dev/om3/core/objectaction/object.go:972 +0x196 Test if inspect data is nil before use. --- drivers/rescontainerocibase/executor.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/rescontainerocibase/executor.go b/drivers/rescontainerocibase/executor.go index c89ed9df9..5c1f8f615 100644 --- a/drivers/rescontainerocibase/executor.go +++ b/drivers/rescontainerocibase/executor.go @@ -70,10 +70,13 @@ func (e *Executor) Enter(ctx context.Context) error { candidates := []string{"/bin/bash", "/bin/sh"} ctx, cancel := context.WithTimeout(ctx, 10*time.Second) inspect, err := e.Inspect(ctx) - pid := inspect.PID() if err != nil { return err } + if inspect == nil { + return fmt.Errorf("the container is not running") + } + pid := inspect.PID() env, err := pidEnv(pid) if err != nil { return err From cb6b3f2ba4e80e88e0c5e008216f340512982f60 Mon Sep 17 00:00:00 2001 From: Christophe Varoqui Date: Tue, 13 Jan 2026 16:29:06 +0100 Subject: [PATCH 5/7] Addrinfo cache tweaks * Return the real up/down status when the name ip cache is valid. Just add a resource status warn log. * Return undef status when the name ip cache is expired. At this point, stop and start instance actions won't work. * Bump the expiration time from 16m to 19m so the instance apps have at 9-19m to stop instead of 6-16m. The 6m worst case has been seen in real life. --- drivers/resiphost/main.go | 24 ++++++++++++++++-------- drivers/resipnetns/main.go | 22 +++++++++++++++------- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/drivers/resiphost/main.go b/drivers/resiphost/main.go index 1a1c71a68..36266b717 100644 --- a/drivers/resiphost/main.go +++ b/drivers/resiphost/main.go @@ -24,7 +24,7 @@ import ( const ( tagNonRouted = "nonrouted" - maxIPAddrAge = 16 * time.Minute + maxIPAddrAge = 19 * time.Minute ) type ( @@ -95,7 +95,7 @@ func (t *T) getDevAndLabel() (string, string, error) { } func (t *T) Start(ctx context.Context) error { - if initialStatus := t.Status(ctx); initialStatus == status.Up { + if initialStatus := t.statusWithIPAddrCacheTrust(ctx); initialStatus == status.Up { t.Log().Infof("%s is already up on %s", t.Name, t.Dev) return nil } @@ -137,6 +137,14 @@ func (t *T) Stop(ctx context.Context) error { } func (t *T) Status(ctx context.Context) status.T { + s := t.statusWithIPAddrCacheTrust(ctx) + if s == status.Up && t._ipaddrAge > 0 { + return status.Warn + } + return s +} + +func (t *T) statusWithIPAddrCacheTrust(ctx context.Context) status.T { if t.Name == "" { t.StatusLog().Warn("name not set") return status.NotApplicable @@ -176,9 +184,12 @@ func (t *T) statusOfAddr(ctx context.Context, dev string) status.T { return status.NotApplicable } ip := t.ipaddr() - if t._ipaddrAge > maxIPAddrAge { - t.StatusLog().Error("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + if ip == nil { + t.StatusLog().Error("ip %s lookup issue, cache miss", t.Name) return status.Undef + } else if t._ipaddrAge > maxIPAddrAge { + t.StatusLog().Error("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) + return status.Warn } else if t._ipaddrAge > 0 { t.StatusLog().Warn("ip %s lookup issue, cache valid (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) } @@ -198,9 +209,6 @@ func (t *T) statusOfAddr(ctx context.Context, dev string) status.T { t.Log().Tracef("ip not found on intf") return status.Down } - if t._ipaddrAge > 0 { - return status.Warn - } return status.Up } @@ -226,7 +234,7 @@ func (t *T) Abort(ctx context.Context) bool { if t._ipaddrAge > maxIPAddrAge { return false // let start fail with an explicit error message } - if initialStatus := t.Status(ctx); initialStatus == status.Up { + if initialStatus := t.statusWithIPAddrCacheTrust(ctx); initialStatus == status.Up { return false // let start fail with an explicit error message } if t.CheckCarrier { diff --git a/drivers/resipnetns/main.go b/drivers/resipnetns/main.go index 7403cf229..c1e10eb14 100644 --- a/drivers/resipnetns/main.go +++ b/drivers/resipnetns/main.go @@ -32,7 +32,7 @@ const ( tagNonRouted = "nonrouted" tagDedicated = "dedicated" - maxIPAddrAge = 16 * time.Minute + maxIPAddrAge = 19 * time.Minute ) type ( @@ -315,6 +315,14 @@ func (t *T) devMTU() (int, error) { } func (t *T) Status(ctx context.Context) status.T { + s := t.statusWithIPAddrCacheTrust(ctx) + if s == status.Up && t._ipaddrAge > 0 { + return status.Warn + } + return s +} + +func (t *T) statusWithIPAddrCacheTrust(ctx context.Context) status.T { var ( err error carrier bool @@ -344,8 +352,11 @@ func (t *T) Status(ctx context.Context) status.T { } } - _ = t.ipaddr() - if t._ipaddrAge > maxIPAddrAge { + ip := t.ipaddr() + if ip == nil { + t.StatusLog().Error("ip %s lookup issue, cache miss", t.Name) + return status.Undef + } else if t._ipaddrAge > maxIPAddrAge { t.StatusLog().Error("ip %s lookup issue, cache expired (%s old)", t.Name, duration.FmtShortDuration(t._ipaddrAge)) return status.Undef } else if t._ipaddrAge > 0 { @@ -370,9 +381,6 @@ func (t *T) Status(ctx context.Context) status.T { if guestDev == "" { return status.Down } - if t._ipaddrAge > 0 { - return status.Warn - } return status.Up } @@ -409,7 +417,7 @@ func (t *T) Abort(ctx context.Context) bool { if t._ipaddrAge > maxIPAddrAge { return false // let start fail with an explicit error message } - if initialStatus := t.Status(ctx); initialStatus == status.Up { + if initialStatus := t.statusWithIPAddrCacheTrust(ctx); initialStatus == status.Up { return false // let start fail with an explicit error message } if t.abortPing() { From 8d4a09904d55c260f0e3f451abe3c2da5b03ba95 Mon Sep 17 00:00:00 2001 From: Christophe Varoqui Date: Wed, 14 Jan 2026 12:10:03 +0100 Subject: [PATCH 6/7] Resource files fetching fixes and enhancements * Add a imon resource files fetched cache init * Add resource files changes and deletes detection on secondary instances, and trigger a immediate fetch when that happens. Example: Jan 14 12:06:14 dev2n2 om[126251]: daemon: imon: kvm/svc/vm2: container#1: file /etc/libvirt/qemu/vm2.xml discovered (csum=c8013f2da973061d8d86e3b60d35b753, mtime=2026-01-10 22:13:50.170941242 +0100 CET) Jan 14 12:06:30 dev2n2 om[126251]: daemon: imon: kvm/svc/vm2: container#1: file /etc/libvirt/qemu/vm2.xml disappeared Jan 14 12:06:30 dev2n2 om[126251]: daemon: imon: kvm/svc/vm2: container#1: file /etc/libvirt/qemu/vm2.xml fetch from dev2n1 --- core/resource/resource.go | 12 --- daemon/imon/fetch_resource_files.go | 154 +++++++++++++++++++++++----- daemon/imon/main.go | 24 +---- 3 files changed, 130 insertions(+), 60 deletions(-) diff --git a/core/resource/resource.go b/core/resource/resource.go index 468644fa8..5e82116c3 100644 --- a/core/resource/resource.go +++ b/core/resource/resource.go @@ -8,7 +8,6 @@ import ( "os" "os/exec" "path/filepath" - "slices" "strings" "time" @@ -1511,17 +1510,6 @@ func getFiles(ctx context.Context, t Driver) Files { return files } -func (t Files) Merge(f File) Files { - l := slices.Clone(t) - for i, this := range l { - if this.Name == f.Name { - l[i] = f - return l - } - } - return append(l, f) -} - func (t Files) Lookup(name string) (File, bool) { for _, file := range t { if file.Name == name { diff --git a/daemon/imon/fetch_resource_files.go b/daemon/imon/fetch_resource_files.go index 36774aad2..5f9a8fade 100644 --- a/daemon/imon/fetch_resource_files.go +++ b/daemon/imon/fetch_resource_files.go @@ -22,8 +22,46 @@ import ( "github.com/opensvc/om3/v3/daemon/msgbus" ) -func (t *filesManager) Fetched() resource.Files { - l := make(resource.Files, len(t.fetched)) +type ( + ridFile struct { + *resource.File + rid string + } + + ridFiles []ridFile + + filesManager struct { + // fetched stores the resource files we fetched to avoid uneeded refetch + fetched map[string]ridFile + + // attention stores a pending InstanceStatusUpdated event received while the fetch + // manager was already processing an event. This serves as a flag to immediately + // retrigger a new fetch cycle upon completion of the current one. + attention *msgbus.InstanceStatusUpdated + + // fetching is true when the resource files fetch and ingest routine is + // running + fetching bool + } +) + +func newFilesManager() *filesManager { + return &filesManager{ + fetched: make(map[string]ridFile), + } +} + +func (t ridFiles) Lookup(name string) (ridFile, bool) { + for _, file := range t { + if file.Name == name { + return file, true + } + } + return ridFile{}, false +} + +func (t *filesManager) Fetched() ridFiles { + l := make(ridFiles, len(t.fetched)) i := 0 for _, v := range t.fetched { l[i] = v @@ -33,22 +71,25 @@ func (t *filesManager) Fetched() resource.Files { } func (t *Manager) handleResourceFiles(ev *msgbus.InstanceStatusUpdated) { + if t.instConfig.ActorConfig == nil { + return + } if ev.Node == t.localhost { - t.doHandleLocalResourceFiles(ev) + t.handleLocalResourceFiles(ev) return } - if t.files.fetching && t.needFetchResourceFiles(ev) { + if !t.needFetchResourceFiles(ev) { + return + } + if t.files.fetching { t.files.attention = ev return } - t.doHandleResourceFiles(ev) + t.handlePeerResourceFiles(ev) } -func (t *Manager) doHandleResourceFiles(ev *msgbus.InstanceStatusUpdated) { +func (t *Manager) handlePeerResourceFiles(ev *msgbus.InstanceStatusUpdated) { t.files.attention = nil - if !t.needFetchResourceFiles(ev) { - return - } t.files.fetching = true go t.fetchResourceFiles(t.files.Fetched(), t.instStatus[t.localhost], ev) } @@ -58,9 +99,6 @@ func (t *Manager) needFetchResourceFiles(ev *msgbus.InstanceStatusUpdated) bool if t.state.LocalExpect != instance.MonitorLocalExpectNone { return false } - if t.instConfig.ActorConfig == nil { - return false - } // Is the remote node a valid resource file authority if ev.Value.Avail != status.Up { @@ -70,30 +108,83 @@ func (t *Manager) needFetchResourceFiles(ev *msgbus.InstanceStatusUpdated) bool return true } -func (t *Manager) doHandleLocalResourceFiles(ev *msgbus.InstanceStatusUpdated) { - if t.instConfig.ActorConfig == nil { +func (t *Manager) initLocalResourceFiles() { + instanceStatus, ok := t.instStatus[t.localhost] + if !ok { return } - announcedFilenames := make(map[string]any) - for _, localResourceStatus := range ev.Value.Resources { + for rid, localResourceStatus := range instanceStatus.Resources { for _, f := range localResourceStatus.Files { - announcedFilenames[f.Name] = nil + t.log.Infof("%s: file %s discovered (csum=%s, mtime=%s)", rid, f.Name, f.Checksum, f.Mtime) + t.files.fetched[f.Name] = ridFile{ + rid: rid, + File: &f, + } } } +} + +func (t *Manager) handleLocalResourceFiles(ev *msgbus.InstanceStatusUpdated) { + var needFetch bool + + // Prepare a map of existing filenames (e.g. reported in the local + // instance status). + existingFilenames := make(map[string]any) + for rid, localResourceStatus := range ev.Value.Resources { + for _, f := range localResourceStatus.Files { + existingFilenames[f.Name] = nil + + if ev.Value.Avail != status.Up { + fetchedFile, ok := t.files.fetched[f.Name] + if !ok { + t.log.Infof("%s: file %s discovered (csum=%s, mtime=%s)", rid, f.Name, f.Checksum, f.Mtime) + t.files.fetched[f.Name] = ridFile{ + rid: rid, + File: &f, + } + needFetch = true + } else if fetchedFile.Checksum != f.Checksum { + t.log.Infof("%s: file %s altered locally (csum=%s, mtime=%s)", rid, f.Name, f.Checksum, f.Mtime) + needFetch = true + } + } + } + } + + // Prepare the list of filenames we fetched but are no longer existing var toDelete []string - for filename := range t.files.fetched { - if _, ok := announcedFilenames[filename]; !ok { - // This already fetched file is no longer announced by the resource - // Drop. + for filename, fetchedFile := range t.files.fetched { + if _, ok := existingFilenames[filename]; !ok { toDelete = append(toDelete, filename) + t.log.Infof("%s: file %s disappeared", fetchedFile.rid, filename) + needFetch = true } } + + // Mark these filenames as not fetched for _, filename := range toDelete { delete(t.files.fetched, filename) } + + // If a file was changed or removed or discovered, fetch from the up instance + if needFetch && ev.Value.Avail != status.Up { + for nodename, instanceStatus := range instance.StatusData.GetByPath(t.path) { + if t.localhost == nodename { + continue + } + if instanceStatus.Avail != status.Up { + continue + } + t.handleResourceFiles(&msgbus.InstanceStatusUpdated{ + Node: nodename, + Value: *instanceStatus, + }) + break + } + } } -func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus instance.Status, ev *msgbus.InstanceStatusUpdated) { +func (t *Manager) fetchResourceFiles(fetched ridFiles, localInstanceStatus instance.Status, ev *msgbus.InstanceStatusUpdated) { var ( ridsToIngest []string @@ -121,7 +212,11 @@ func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus if !ok { // fallback to the t.instStatus cache, ie we never fetched // this file yet. - localFile, _ = localResourceStatus.Files.Lookup(peerFile.Name) + file, _ := localResourceStatus.Files.Lookup(peerFile.Name) + localFile = ridFile{ + File: &file, + rid: rid, + } } if !peerFile.Mtime.After(localFile.Mtime) { continue @@ -130,11 +225,14 @@ func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus continue } if err := t.fetchResourceFile(rid, peerFile, ev.Node); err != nil { - t.log.Warnf("%s", err) + t.log.Warnf("%s: fetch %s: %s", rid, peerFile.Name, err) continue } - done.Files = append(done.Files, peerFile) + done.Files = append(done.Files, ridFile{ + File: &peerFile, + rid: rid, + }) if peerFile.Ingest { ridsToIngest = append(ridsToIngest, rid) @@ -154,7 +252,7 @@ func (t *Manager) fetchResourceFiles(fetched resource.Files, localInstanceStatus } func (t *Manager) fetchResourceFile(rid string, peerFile resource.File, from string) error { - t.log.Infof("%s: fetch %s from %s", rid, peerFile.Name, from) + t.log.Infof("%s: file %s fetch from %s", rid, peerFile.Name, from) c, err := client.New( client.WithURL(daemonsubsystem.PeerURL(from)), client.WithUsername(t.localhost), @@ -175,7 +273,7 @@ func (t *Manager) fetchResourceFile(rid string, peerFile resource.File, from str defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return fmt.Errorf("%s: fetch %s: %s", rid, peerFile.Name, resp.Status) + return fmt.Errorf("unexpected api response status %s", resp.Status) } // Create a temp file and write the response body @@ -245,6 +343,6 @@ func (t *Manager) onFetchDone(c cmdFetchDone) { } t.files.fetching = false if t.files.attention != nil { - t.doHandleResourceFiles(t.files.attention) + t.handlePeerResourceFiles(t.files.attention) } } diff --git a/daemon/imon/main.go b/daemon/imon/main.go index 85fb35421..c960c2faa 100644 --- a/daemon/imon/main.go +++ b/daemon/imon/main.go @@ -36,7 +36,6 @@ import ( "github.com/opensvc/om3/v3/core/object" "github.com/opensvc/om3/v3/core/priority" "github.com/opensvc/om3/v3/core/rawconfig" - "github.com/opensvc/om3/v3/core/resource" "github.com/opensvc/om3/v3/core/status" "github.com/opensvc/om3/v3/daemon/daemondata" "github.com/opensvc/om3/v3/daemon/daemonenv" @@ -72,7 +71,7 @@ type ( // or InstanceStatusUpdated. instStatus map[string]instance.Status - files filesManager + files *filesManager // instMonitor tracks instance.Monitor for path on other nodes, iit is updated on // ObjectStatusUpdated for path events where srcEvent is InstanceMonitorDeleted @@ -170,20 +169,6 @@ type ( regularResourceOrchestrate orchestrationResource } - filesManager struct { - // fetched stores the resource files we fetched to avoid uneeded refetch - fetched map[string]resource.File - - // attention stores a pending InstanceStatusUpdated event received while the fetch - // manager was already processing an event. This serves as a flag to immediately - // retrigger a new fetch cycle upon completion of the current one. - attention *msgbus.InstanceStatusUpdated - - // fetching is true when the resource files fetch and ingest routine is - // running - fetching bool - } - // cmdOrchestrate can be used from post action go routines cmdOrchestrate struct { state instance.MonitorState @@ -201,7 +186,7 @@ type ( } cmdFetchDone struct { - Files resource.Files + Files ridFiles } Factory struct { @@ -256,9 +241,7 @@ func start(parent context.Context, qs pubsub.QueueSizer, p naming.Path, nodes [] cmdC: make(chan any), databus: databus, publisher: pubsub.PubFromContext(ctx), - files: filesManager{ - fetched: make(map[string]resource.File), - }, + files: newFilesManager(), instStatus: make(map[string]instance.Status), instMonitor: make(map[string]instance.Monitor), nodeMonitor: make(map[string]node.Monitor), @@ -385,6 +368,7 @@ func (t *Manager) worker(initialNodes []string) { t.initRelationAvailStatus() t.initResourceMonitor() + t.initLocalResourceFiles() t.updateIsLeader() t.updateIfChange() From ea6c05b84f9979b707bc2646851ad4624e3fd464 Mon Sep 17 00:00:00 2001 From: Christophe Varoqui Date: Wed, 14 Jan 2026 18:52:30 +0100 Subject: [PATCH 7/7] Document ip addr caching in the ip.name kw of ip.host and ip.netns --- drivers/resiphost/text/kw/name | 35 ++++++++++++++++++++++++++++++++ drivers/resipnetns/text/kw/name | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/drivers/resiphost/text/kw/name b/drivers/resiphost/text/kw/name index 811cb1da9..58b15da1f 100644 --- a/drivers/resiphost/text/kw/name +++ b/drivers/resiphost/text/kw/name @@ -1,8 +1,43 @@ The DNS name or IP address of the ip resource. +Scoping +------- + Can be different from one node to the other, in which case the `name@` scoping syntax can be used. This is most useful to specify a different ip when the service starts in DRP mode, where subnets are likely to be different than those of the production datacenter. + +DNS name caching +---------------- + +If the value is a DNS name, a lookup is done for every resource status +evaluation. + +* If the lookup succeeds, the addr is store in a cache file. + +* If the lookup fails, the address is fetched from the cache file. + + * If the cache is not expired (refreshed in the past 19 minutes) the + address is trusted: + + * The resource status is degraded to from `up` to `warn`. + + * The resource restarts begin if `restarts > 0`. + The restart is a no-op but the resource status is re-evaluated. + If the lookup succeeds, the resource status to `up`. + + * The monitor action is triggered if all restarts failed and + `monitor_action` is set. + For example, a `freezestop` can still succeed using the cached + ip address. + + * If the cache is expired (older than 19 minutes): + + * The resource status is `warn`, weither the actual untrusted ip + address is plumbed or not. + + * The `stop` and `start` actions no longer work. An admin action + is necessary to repair the configuration or dns lookups. diff --git a/drivers/resipnetns/text/kw/name b/drivers/resipnetns/text/kw/name index 811cb1da9..64fa81f0a 100644 --- a/drivers/resipnetns/text/kw/name +++ b/drivers/resipnetns/text/kw/name @@ -1,8 +1,44 @@ The DNS name or IP address of the ip resource. +Scoping +------- + Can be different from one node to the other, in which case the `name@` scoping syntax can be used. This is most useful to specify a different ip when the service starts in DRP mode, where subnets are likely to be different than those of the production datacenter. + +DNS name caching +---------------- + +If the value is a DNS name, a lookup is done for every resource status +evaluation. + +* If the lookup succeeds, the addr is store in a cache file. + +* If the lookup fails, the address is fetched from the cache file. + + * If the cache is not expired (refreshed in the past 19 minutes) the + address is trusted: + + * The resource status is degraded to from `up` to `warn`. + + * The resource restarts begin if `restarts > 0`. + The restart is a no-op but the resource status is re-evaluated. + If the lookup succeeds, the resource status to `up`. + + * The monitor action is triggered if all restarts failed and + `monitor_action` is set. + For example, a `freezestop` can still succeed using the cached + ip address. + + * If the cache is expired (older than 19 minutes): + + * The resource status is `warn`, weither the actual untrusted ip + address is plumbed or not. + + * The `stop` and `start` actions no longer work. An admin action + is necessary to repair the configuration or dns lookups. +