From bfbb8bb1ef713b0a4950adab679d26da1d2855e3 Mon Sep 17 00:00:00 2001 From: Pearl Dsilva Date: Mon, 12 Jan 2026 11:46:50 -0500 Subject: [PATCH] Allow cleaning up of networks stuck in Implementing state --- .../jobs/impl/AsyncJobManagerImpl.java | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/framework/jobs/src/main/java/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java b/framework/jobs/src/main/java/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java index 80140b0d9502..fb6895b4743e 100644 --- a/framework/jobs/src/main/java/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java +++ b/framework/jobs/src/main/java/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java @@ -969,6 +969,13 @@ public void reallyRun() { } logger.trace("End cleanup expired async-jobs"); + + // 3) Cleanup orphaned networks stuck in Implementing state without async jobs + try { + cleanupOrphanedNetworks(); + } catch (Throwable e) { + logger.error("Unexpected exception when trying to cleanup orphaned networks", e); + } } catch (Throwable e) { logger.error("Unexpected exception when trying to execute queue item, ", e); } @@ -1284,6 +1291,74 @@ private void cleanupFailedSnapshotsCreatedWithDefaultStrategy(final long msid) { } } + /** + * Cleanup networks that are stuck in Implementing state without associated async jobs. + * This only processes networks that have been stuck for longer than the job expiration threshold. + */ + private void cleanupOrphanedNetworks() { + try { + SearchCriteria sc = networkDao.createSearchCriteria(); + sc.addAnd("state", SearchCriteria.Op.EQ, Network.State.Implementing); + sc.addAnd("removed", SearchCriteria.Op.NULL); + List implementingNetworks = networkDao.search(sc, null); + + if (implementingNetworks == null || implementingNetworks.isEmpty()) { + return; + } + + logger.debug("Found {} networks in Implementing state, checking for orphaned networks", implementingNetworks.size()); + + final long expireMinutes = JobExpireMinutes.value(); + final Date cutoffTime = new Date(System.currentTimeMillis() - (expireMinutes * 60 * 1000)); + + for (NetworkVO network : implementingNetworks) { + if (network.getCreated().after(cutoffTime)) { + logger.trace("Network {} in Implementing state is only {} minutes old (threshold: {} minutes), skipping cleanup", + network.getId(), + (System.currentTimeMillis() - network.getCreated().getTime()) / 60000, + expireMinutes); + continue; + } + + List jobs = _jobDao.findInstancePendingAsyncJobs("Network", network.getAccountId()); + boolean hasActiveJob = false; + for (AsyncJobVO job : jobs) { + if (job.getInstanceId() != null && job.getInstanceId().equals(network.getId())) { + hasActiveJob = true; + break; + } + } + + if (hasActiveJob) { + logger.debug("Network {} in Implementing state has active async job, skipping cleanup", network.getId()); + continue; + } + + logger.warn("Found orphaned network {} in Implementing state without async job. " + + "Network created: {}, age: {} minutes, expiration threshold: {} minutes. Transitioning to Shutdown state.", + network.getId(), network.getCreated(), + (System.currentTimeMillis() - network.getCreated().getTime()) / 60000, + expireMinutes); + updateNetworkState(network); + + } + } catch (Exception e) { + logger.error("Error while cleaning up orphaned networks", e); + } + } + + private void updateNetworkState(NetworkVO network) { + try { + networkOrchestrationService.stateTransitTo(network, Network.Event.OperationFailed); + logger.info("Successfully transitioned orphaned network {} to Shutdown state using state machine", network.getId()); + } catch (final NoTransitionException e) { + logger.debug("State transition failed for orphaned network {}, forcing state update", network.getId()); + network.setState(Network.State.Shutdown); + networkDao.update(network.getId(), network); + logger.info("Successfully forced orphaned network {} to Shutdown state", network.getId()); + } + } + @Override public void onManagementNodeJoined(List nodeList, long selfNodeId) { }