diff --git a/frontend/csi/node_server.go b/frontend/csi/node_server.go index 1655fa1d3..7db852690 100644 --- a/frontend/csi/node_server.go +++ b/frontend/csi/node_server.go @@ -5,6 +5,7 @@ package csi import ( "context" "crypto/rand" + "encoding/hex" "fmt" "math/big" "os" @@ -1261,6 +1262,26 @@ func (p *Plugin) nodeUnstageISCSIVolume( } } + // Derive the device path using the LunSerial and LunID. + // The publishInfo.DevicePath may be incorrect due to Kernel actions. + // Fallback to using the publishInfo.DevicePath if the multipath device cannot be derived. + fields := LogFields{ + "LunSerial": publishInfo.IscsiLunSerial, + "LunID": publishInfo.IscsiLunNumber, + } + lunSerialHex := hex.EncodeToString([]byte(publishInfo.IscsiLunSerial)) + multipathDevice, err := iscsiUtils.GetMultipathDeviceForLUN(ctx, lunSerialHex, int(publishInfo.IscsiLunNumber)) + if err != nil { + Logc(ctx).WithError(err).WithFields(fields).Debug("Error finding multipath device for lun.") + if errors.IsNotFoundError(err) { + Logc(ctx).WithError(err).WithFields(fields).Debug("no multipath device is found for lun, resetting device info") + publishInfo.DevicePath = "" + } + } else { + Logc(ctx).WithFields(fields).Debugf("Found multipath device %s for lun.", multipathDevice) + publishInfo.DevicePath = "/dev/" + multipathDevice + } + // Delete the device from the host. unmappedMpathDevice, err := utils.PrepareDeviceForRemoval(ctx, publishInfo, nil, p.unsafeDetach, force) if err != nil { diff --git a/utils/devices.go b/utils/devices.go index bb0fd6238..a8c391e3d 100644 --- a/utils/devices.go +++ b/utils/devices.go @@ -956,7 +956,7 @@ func compareWithPublishedSerialNumber(ctx context.Context, publishInfo *VolumePu // Get Device based on the serial number and at the same time identify if it is a ghost device. // Multipath UUID contains LUN serial in hex format lunSerialHex := hex.EncodeToString([]byte(publishInfo.IscsiLunSerial)) - multipathDevice, err := IscsiUtils.GetMultipathDeviceBySerial(ctx, lunSerialHex) + multipathDevice, err := IscsiUtils.GetMultipathDeviceForLUN(ctx, lunSerialHex, int(publishInfo.IscsiLunNumber)) if err != nil { return false, fmt.Errorf("failed to verify multipath device for serial '%v'; %v ", publishInfo.IscsiLunSerial, err) diff --git a/utils/devices_linux.go b/utils/devices_linux.go index 37ac91935..5a1a3662a 100644 --- a/utils/devices_linux.go +++ b/utils/devices_linux.go @@ -13,6 +13,7 @@ import ( "time" "unsafe" + execCmd "github.com/netapp/trident/utils/exec" log "github.com/sirupsen/logrus" "github.com/cenkalti/backoff/v4" @@ -29,6 +30,8 @@ const ( luksType = "luks2" // Return code for "no permission (bad passphrase)" from cryptsetup command luksCryptsetupBadPassphraseReturnCode = 2 + luksCloseDeviceSafelyClosedExitCode = 0 + luksCloseDeviceAlreadyClosedExitCode = 4 ) // flushOneDevice flushes any outstanding I/O to a disk @@ -175,17 +178,28 @@ func (d *LUKSDevice) Close(ctx context.Context) error { } // closeLUKSDevice performs a luksClose on the specified LUKS device +// It gracefully handles the cases where a LUKS device has already been closed or the device doesn't exist. func closeLUKSDevice(ctx context.Context, luksDevicePath string) error { output, err := command.ExecuteWithTimeoutAndInput( ctx, "cryptsetup", luksCommandTimeout, true, "", "luksClose", luksDevicePath, ) if nil != err { - Log().WithFields(LogFields{ - "MappedDeviceName": luksDevicePath, - "error": err.Error(), - "output": string(output), - }).Debug("Failed to Close LUKS device") - return fmt.Errorf("failed to Close LUKS device %s; %v", luksDevicePath, err) + fields := LogFields{"luksDevicePath": luksDevicePath, "output": string(output), "err": err.Error()} + var exitErr execCmd.ExitError + if !errors.As(err, &exitErr) { + Logc(ctx).WithFields(fields).Error("Failed to close LUKS device with unknown error.") + return fmt.Errorf("failed to close LUKS device %s; %w", luksDevicePath, err) + } + + switch exitErr.ExitCode() { + // exit code "0" and "4" are safe to ignore. "0" will likely never be hit but check for it regardless. + case luksCloseDeviceSafelyClosedExitCode, luksCloseDeviceAlreadyClosedExitCode: + Logc(ctx).WithFields(fields).Debug("LUKS device is already closed or did not exist.") + return nil + default: + Logc(ctx).WithFields(fields).Error("Failed to close LUKS device.") + return fmt.Errorf("exit code '%d' when closing LUKS device '%s'; %w", exitErr.ExitCode(), luksDevicePath, err) + } } return nil } @@ -209,21 +223,18 @@ func IsLUKSDeviceOpen(ctx context.Context, luksDevicePath string) (bool, error) // EnsureLUKSDeviceClosed ensures there is not an open LUKS device at the specified path func EnsureLUKSDeviceClosed(ctx context.Context, luksDevicePath string) error { GenerateRequestContextForLayer(ctx, LogLayerUtils) + fields := LogFields{"luksDevicePath": luksDevicePath} - _, err := osFs.Stat(luksDevicePath) - if err == nil { - // Need to Close the LUKS device - return closeLUKSDevice(ctx, luksDevicePath) - } else if os.IsNotExist(err) { - Logc(ctx).WithFields(LogFields{ - "device": luksDevicePath, - }).Debug("LUKS device not found.") - } else { - Logc(ctx).WithFields(LogFields{ - "device": luksDevicePath, - "error": err.Error(), - }).Debug("Failed to stat device") - return fmt.Errorf("could not stat device: %s %v.", luksDevicePath, err) + if err := closeLUKSDevice(ctx, luksDevicePath); err != nil { + Logc(ctx).WithFields(fields).WithError(err).Error("Could not close LUKS device.") + return fmt.Errorf("could not close LUKS device %s; %w", luksDevicePath, err) + } + + // If LUKS close succeeded, the block device node should be gone. + // It's the responsibility of the kernel and udev to manage /dev/mapper entries. + // If the /dev/mapper entry lives on, log a warning and return success. + if _, err := osFs.Stat(luksDevicePath); err != nil { + Logc(ctx).WithFields(fields).Warn("Stale device mapper file found for LUKS device. Is udev is running?") } return nil } diff --git a/utils/exec/command.go b/utils/exec/command.go index 2c3e3706d..8ae876dc9 100644 --- a/utils/exec/command.go +++ b/utils/exec/command.go @@ -19,9 +19,18 @@ import ( var ( xtermControlRegex = regexp.MustCompile(`\x1B\[[0-9;]*[a-zA-Z]`) - _ Command = NewCommand() + // Ensure these structures always implement these interfaces at compilation. + _ Command = NewCommand() + _ ExitError = &exec.ExitError{} ) +// ExitError defines the methods that exec.ExitError implements. +// This enables unit testing and mocking of exit codes. +type ExitError interface { + error + ExitCode() int +} + // Command defines a set of behaviors for executing commands on a host. type Command interface { Execute(ctx context.Context, name string, args ...string) ([]byte, error) diff --git a/utils/iscsi.go b/utils/iscsi.go index 5e6ab0d1c..ad1d2359f 100644 --- a/utils/iscsi.go +++ b/utils/iscsi.go @@ -7,8 +7,10 @@ import ( "encoding/binary" "encoding/hex" "fmt" + "io/fs" "os" "os/exec" + "path/filepath" "regexp" "sort" "strconv" @@ -478,8 +480,12 @@ func (h *IscsiReconcileHelper) GetMultipathDeviceDisks(ctx context.Context, mult diskPath := chrootPathPrefix + fmt.Sprintf("/sys/block/%s/slaves/", multipathDevice) diskDirs, err := os.ReadDir(diskPath) if err != nil { - Logc(ctx).WithError(err).Errorf("Could not read %s", diskDirs) - return nil, fmt.Errorf("failed to identify multipath device disks; unable to read '%s'", diskDirs) + if errors.Is(err, fs.ErrNotExist) { + Logc(ctx).Warningf("multipath device directory %s does not exist, device is already gone?", diskDirs) + return nil, nil + } + Logc(ctx).WithError(err).Errorf("Could not read %s", diskPath) + return nil, fmt.Errorf("failed to identify multipath device disks; unable to read %q :%w", diskPath, err) } for _, diskDir := range diskDirs { @@ -496,7 +502,10 @@ func (h *IscsiReconcileHelper) GetMultipathDeviceDisks(ctx context.Context, mult // GetMultipathDeviceBySerial find DM device whose UUID /sys/block/dmX/dm/uuid contains serial in hex format. func (h *IscsiReconcileHelper) GetMultipathDeviceBySerial(ctx context.Context, hexSerial string) (string, error) { - sysPath := chrootPathPrefix + "/sys/block/" + var ( + sysPath = chrootPathPrefix + "/sys/block/" + multipathCMDTimeout = 10 * time.Second + ) blockDirs, err := os.ReadDir(sysPath) if err != nil { @@ -520,18 +529,108 @@ func (h *IscsiReconcileHelper) GetMultipathDeviceBySerial(ctx context.Context, h continue } - if strings.Contains(uuid, hexSerial) { + if !strings.Contains(uuid, hexSerial) { + continue + } + + // Use 'multipath -l' and check the exit code. A success return + // indicates that it is a not a stale device. Noted that frequently + // running this in a busy environment may time out, and lead to a + // situation that the device is not properly discovered. But it will + // fail the following operations and retries next time. + if _, err := command.ExecuteWithTimeout(ctx, "multipath", multipathCMDTimeout, true, "-l", dmDeviceName); err != nil { Logc(ctx).WithFields(LogFields{ - "UUID": hexSerial, - "multipathDevice": dmDeviceName, - }).Debug("Found multipath device by UUID.") - return dmDeviceName, nil + "device": dmDeviceName, + }).WithError(err).Warn("Candidate is not a valid map known to multipathd, ignoring stale entry.") + continue } + + Logc(ctx).WithFields(LogFields{ + "UUID": hexSerial, + "multipathDevice": dmDeviceName, + }).Debug("Found multipath device by UUID.") + return dmDeviceName, nil } return "", errors.NotFoundError("no multipath device found") } +// GetMultipathDeviceForLUN is the most robust method to find a multipath device. +// It uses a three-factor check (serial, path liveness, LUN ID) to ensure the +// correct and active device is returned, filtering out any stale entries. +func (h *IscsiReconcileHelper) GetMultipathDeviceForLUN(ctx context.Context, hexSerial string, lunID int) (string, error) { + var ( + sysPath = chrootPathPrefix + "/sys/block/" + lunIDString = strconv.Itoa(lunID) + fields = LogFields{ + "lunID": lunID, + "lunSerial": hexSerial, + } + ) + + Logc(ctx).WithFields(fields).Debug(">>>> iscsi.GetMultipathDeviceForLUN") + defer Logc(ctx).WithFields(fields).Debug("<<<< iscsi.GetMultipathDeviceForLUN") + + blockDirs, err := os.ReadDir(sysPath) + if err != nil { + Logc(ctx).WithError(err).WithFields(fields).Errorf("Could not read %s", sysPath) + return "", fmt.Errorf("failed to list block devices in %q", sysPath) + } + + for _, blockDir := range blockDirs { + dmDeviceName := blockDir.Name() + if !strings.HasPrefix(dmDeviceName, "dm-") { + continue + } + + // Check for a WWID match. + uuid, err := h.GetMultipathDeviceUUID(dmDeviceName) + if err != nil || !strings.Contains(uuid, hexSerial) { + // Not a match, or not a multipath device. + continue + } + + // Perform checks whether it's a stale device by checking slaves. + Logc(ctx).WithFields(LogFields{ + "serial": hexSerial, + "candidate": dmDeviceName, + "uuid": uuid, + }).Debug("Found candidate multipath device") + + // A stale device should contain no slaves. + slavesPath := filepath.Join(sysPath, dmDeviceName, "slaves") + slaves, err := os.ReadDir(slavesPath) + if err != nil || len(slaves) == 0 { + Logc(ctx).WithFields(fields).Warnf("Candidate %s is a stale device with no paths, ignoring.", dmDeviceName) + continue + } + + // Verify the LUN ID on an actual path using the 1st slave device info. + scsiDevicePath := filepath.Join(slavesPath, slaves[0].Name(), "device", "scsi_device") + scsiDeviceDirs, err := os.ReadDir(scsiDevicePath) + if err != nil || len(scsiDeviceDirs) == 0 { + Logc(ctx).WithFields(fields).WithError(err).Warnf("Could not read scsi_device %q info for candidate path %s.", scsiDevicePath, dmDeviceName) + continue + } + + scsiDeviceAddress := scsiDeviceDirs[0].Name() + parts := strings.Split(scsiDeviceAddress, ":") + if len(parts) != 4 { + Logc(ctx).WithFields(fields).Warnf("Invalid SCSI device address %s.", scsiDeviceAddress) + continue + } + + currentLunID := parts[3] + if currentLunID == lunIDString { + // All lun ID and serial match the multipath device. + Logc(ctx).WithFields(fields).Debugf("Successfully identified and verified multipath device %s.", dmDeviceName) + return dmDeviceName, nil + } + } + + return "", errors.NotFoundError(fmt.Sprintf("no active multipath device found for serial %s and LUN ID %d", hexSerial, lunID)) +} + // waitForDeviceScan scans all paths to a specific LUN and waits until all // SCSI disk-by-path devices for that LUN are present on the host. func waitForDeviceScan(ctx context.Context, lunID int, iSCSINodeName string) error { diff --git a/utils/iscsi_types.go b/utils/iscsi_types.go index d9c379f31..0231874f2 100644 --- a/utils/iscsi_types.go +++ b/utils/iscsi_types.go @@ -10,6 +10,7 @@ type IscsiReconcileUtils interface { GetISCSIHostSessionMapForTarget(context.Context, string) map[int]int GetSysfsBlockDirsForLUN(int, map[int]int) []string GetMultipathDeviceUUID(string) (string, error) + GetMultipathDeviceForLUN(context.Context, string, int) (string, error) GetMultipathDeviceBySerial(context.Context, string) (string, error) GetMultipathDeviceDisks(context.Context, string) ([]string, error) GetDevicesForLUN(paths []string) ([]string, error)