From 0150d087a32f298872865179813d2bbbb14c8fa7 Mon Sep 17 00:00:00 2001 From: yuval Date: Thu, 10 Apr 2025 16:08:02 +0300 Subject: [PATCH] lightos: Early exit in terminate_connection with retry logic When a volume is deleted in the LightBits cluster and then detached/deleted through OpenStack, the operation will fail and the volume will remain in 'detaching' state. This patch introduces two main improvements: 1. Added _get_lightos_volume_with_retry() method: - Implements retry logic with configurable timeout (default 5 seconds) - Handles transient errors (HTTP 500) by retrying - Returns immediately for definitive status codes (OK, NOT_FOUND, etc.) - Prevents false negatives due to temporary API unavailability 2. Enhanced terminate_connection() to check volume existence: - Uses _get_lightos_volume_with_retry() to verify volume exists - Returns early if volume is NOT_FOUND (already deleted) - Prevents unnecessary ACL operations on non-existent volumes - Avoids leaving volumes stuck in 'detaching' state Also adds a new constant STOP_LIGHTOS_CMD_FOR_RETURN_STATUS_LIST to define HTTP status codes that should stop retries immediately. Test added: test_terminate_connection_no_api_response to verify the early exit behavior when volume is not found. Change-Id: If9cbaf6c73cad9e02a266f3038b64bff37fca00d --- .../drivers/lightos/test_lightos_storage.py | 65 +++++++++++++++ cinder/volume/drivers/lightos.py | 82 ++++++++++++++++++- 2 files changed, 144 insertions(+), 3 deletions(-) diff --git a/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py b/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py index eb1048fc3ac..d1bf1457b0a 100644 --- a/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py +++ b/cinder/tests/unit/volume/drivers/lightos/test_lightos_storage.py @@ -881,6 +881,29 @@ def test_terminate_connection_with_hostnqn(self): self.driver.delete_volume(volume) db.volume_destroy(self.ctxt, volume.id) + def test_terminate_connection_volume_was_deleted_by_admin(self): + InitialConnectorMock.nqn = "hostnqn1" + InitialConnectorMock.found_discovery_client = True + self.driver.do_setup(None) + vol_type = test_utils.create_volume_type(self.ctxt, self, + name='my_vol_type') + volume = test_utils.create_volume(self.ctxt, size=4, + volume_type_id=vol_type.id) + self.driver.create_volume(volume) + + def send_cmd_mock(cmd, **kwargs): + if cmd == "get_volume": + return (httpstatus.NOT_FOUND, {}) + if cmd == "get_volume_by_name": + return (httpstatus.NOT_FOUND, {}) + return cluster_send_cmd(cmd, **kwargs) + cluster_send_cmd = deepcopy(self.driver.cluster.send_cmd) + self.driver.cluster.send_cmd = send_cmd_mock + self.driver.terminate_connection(volume, + get_connector_properties()) + self.driver.delete_volume(volume) + db.volume_destroy(self.ctxt, volume.id) + def test_terminate_connection_with_empty_hostnqn_should_fail(self): InitialConnectorMock.nqn = "" InitialConnectorMock.found_discovery_client = True @@ -1004,3 +1027,45 @@ def test_get_volume_stats(self): assert volumes_data['free_capacity_gb'] == 'infinite', \ "Expected 'infinite', received %s" % \ volumes_data['free_capacity_gb'] + + def test_terminate_connection_volume_lookup_by_name(self): + """Test that terminate_connection looks up volume by name not UUID. + + This test verifies the fix for a bug where terminate_connection + was using volume.id (Cinder's UUID) to look up the LightOS volume, + but should use the volume name instead. The DBMock generates random + UUIDs that don't match Cinder's volume.id, simulating the real + behavior where LightOS assigns its own UUIDs. + """ + InitialConnectorMock.nqn = FAKE_CLIENT_HOSTNQN + InitialConnectorMock.found_discovery_client = True + self.driver.do_setup(None) + + # Create a volume - DBMock will assign a random UUID different from + # volume.id + vol_type = test_utils.create_volume_type(self.ctxt, self, + name='my_vol_type') + volume = test_utils.create_volume(self.ctxt, size=4, + volume_type_id=vol_type.id) + self.driver.create_volume(volume) + + # Initialize connection to add ACL + self.driver.initialize_connection(volume, get_connector_properties()) + + # Terminate connection should work even though volume.id != + # lightos_uuid. This requires looking up by name, not UUID + self.driver.terminate_connection(volume, get_connector_properties()) + + # Verify the volume still exists and ACL was removed + project_name = self.driver._get_lightos_project_name(volume) + lightos_volname = self.driver._lightos_volname(volume) + status, vol_data = self.driver._get_lightos_volume( + project_name, timeout=30, vol_name=lightos_volname) + self.assertEqual(httpstatus.OK, status) + self.assertIsNotNone(vol_data) + # ACL should be ALLOW_NONE after termination + self.assertIn('ALLOW_NONE', vol_data['acl']['values']) + + # Cleanup + self.driver.delete_volume(volume) + db.volume_destroy(self.ctxt, volume.id) diff --git a/cinder/volume/drivers/lightos.py b/cinder/volume/drivers/lightos.py index 0a6f216268b..4aa47b682e7 100644 --- a/cinder/volume/drivers/lightos.py +++ b/cinder/volume/drivers/lightos.py @@ -100,6 +100,11 @@ BLOCK_SIZE = 8 LIGHTOS = "LIGHTOS" INTERM_SNAPSHOT_PREFIX = "for_clone_" +STOP_LIGHTOS_CMD_FOR_RETURN_STATUS_LIST = [httpstatus.OK, + httpstatus.BAD_REQUEST, + httpstatus.UNAUTHORIZED, + httpstatus.FORBIDDEN, + httpstatus.NOT_FOUND] class LightOSConnection(object): @@ -512,6 +517,67 @@ def _get_lightos_volume( timeout=timeout, volume_name=vol_name) + def _get_lightos_volume_with_retry( + self, + project_name, + vol_uuid=None, + vol_name=None, + retry_timeout=5): + """Get LightOS volume with retry logic for 10 seconds by default. + + Args: + project_name: The project name + vol_uuid: Volume UUID (optional) + vol_name: Volume name (optional) + retry_timeout: How long to retry in seconds (default: 10) + + Returns: + Tuple of (status_code, response_data) + """ + assert vol_uuid or vol_name, \ + 'LightOS volume name or UUID must be specified' + + end_time = time.time() + retry_timeout + last_status = None + last_response = None + + while time.time() < end_time: + try: + status, response = self._get_lightos_volume( + project_name=project_name, + timeout=self.logical_op_timeout, + vol_uuid=vol_uuid, + vol_name=vol_name) + + # Return immediately for definitive status codes + # (success or permanent errors) + if (status in STOP_LIGHTOS_CMD_FOR_RETURN_STATUS_LIST and + response): + return status, response + + # For transient errors (500), retry until timeout + # Store the last response for potential return after timeout + if status == httpstatus.INTERNAL_SERVER_ERROR: + LOG.debug( + 'Got transient error %s while getting LightOS ' + 'volume, will retry', status) + else: + LOG.debug( + 'Got unexpected status %s while getting LightOS ' + 'volume, will retry', status) + + last_status = status + last_response = response + + except Exception as e: + LOG.debug( + 'Exception while getting LightOS volume: %s', str(e)) + last_status = None + last_response = str(e) + + # Return the last status and response if we've exhausted retries + return last_status, last_response + def _lightos_volname(self, volume): volid = volume.name_id lightos_volname = CONF.volume_name_template % volid @@ -1585,7 +1651,16 @@ def terminate_connection(self, volume, connector, **kwargs): hostnqn) project_name = self._get_lightos_project_name(volume) - + lightos_volname = self._lightos_volname(volume) + status, _response = self._get_lightos_volume_with_retry( + project_name, vol_name=lightos_volname) + LOG.debug("terminate_connection: requests status %s", status) + # if the volume is missing we can just return + if status == httpstatus.NOT_FOUND: + LOG.debug( + 'terminate_connection: volume %s not found, nothing to do', + volume) + return if not hostnqn: if force: LOG.debug( @@ -1605,9 +1680,10 @@ def terminate_connection(self, volume, connector, **kwargs): host_ips) if not success or not self._wait_for_volume_acl( project_name, lightos_volname, hostnqn, False): - LOG.warning( - 'Could not remove ACL for hostnqn %s LightOS \ + msg = 'Could not remove ACL for hostnqn %s LightOS \ volume %s, limping along', + LOG.warning( + msg, hostnqn, lightos_volname)