From 072640f51c53d8622df40482d988b018696ed178 Mon Sep 17 00:00:00 2001 From: Kittyfisto Date: Tue, 24 Mar 2020 23:08:58 +0100 Subject: [PATCH] Fixed race condition in OutOfProcessQueue #66 The child process was able to survive a call to Dispose() if it crashed right before Dispose() was called. OutOfProcessQueue.Dispose() now performs synchronisation at the cost of of a slightly changed behaviour visible to users, if they call dispose from within a FailureHandler callback. --- .../OutOfProcessSilo/FailureDetectionTest.cs | 10 ++-- .../OutOfProcessSilo/OutOfProcessSiloTest.cs | 55 +++++++++++++++++++ .../Hosting/OutOfProcess/OutOfProcessQueue.cs | 51 ++++++++++++++--- 3 files changed, 104 insertions(+), 12 deletions(-) diff --git a/SharpRemote.SystemTest/OutOfProcessSilo/FailureDetectionTest.cs b/SharpRemote.SystemTest/OutOfProcessSilo/FailureDetectionTest.cs index 92d5b27..d611104 100644 --- a/SharpRemote.SystemTest/OutOfProcessSilo/FailureDetectionTest.cs +++ b/SharpRemote.SystemTest/OutOfProcessSilo/FailureDetectionTest.cs @@ -136,13 +136,13 @@ public void TestFailureDetection11() }; using (var silo = new SharpRemote.Hosting.OutOfProcessSilo(failureSettings: settings, failureHandler: handler.Object)) - using (var handle = new ManualResetEvent(false)) + using (var failureDetectedEvent = new ManualResetEvent(false)) { handler.Setup(x => x.OnResolutionFinished(It.IsAny(), It.IsAny(), It.IsAny())) .Callback((Failure f, Decision d, Resolution r) => { + failureDetectedEvent.Set(); silo.Dispose(); - handle.Set(); }); silo.Start(); @@ -152,10 +152,10 @@ public void TestFailureDetection11() Process hostProcess = Process.GetProcessById(id.Value); hostProcess.Kill(); - handle.WaitOne(TimeSpan.FromSeconds(2)) - .Should().BeTrue("Because the failure should've been detected as well as handled"); + failureDetectedEvent.WaitOne(TimeSpan.FromSeconds(2)) + .Should().BeTrue("Because the failure should've been detected"); - silo.IsDisposed.Should().BeTrue(); + silo.Property(x => x.IsDisposed).ShouldEventually().BeTrue(); silo.HasProcessFailed.Should().BeTrue(); silo.IsProcessRunning.Should().BeFalse(); } diff --git a/SharpRemote.SystemTest/OutOfProcessSilo/OutOfProcessSiloTest.cs b/SharpRemote.SystemTest/OutOfProcessSilo/OutOfProcessSiloTest.cs index 86207fa..0dd63bc 100644 --- a/SharpRemote.SystemTest/OutOfProcessSilo/OutOfProcessSiloTest.cs +++ b/SharpRemote.SystemTest/OutOfProcessSilo/OutOfProcessSiloTest.cs @@ -56,6 +56,37 @@ private static bool IsProcessRunning(int pid) } } + private static string TryGetExecutableName(Process process) + { + try + { + return process.MainModule.FileName; + } + catch (Exception) + { + return null; + } + } + + private static bool IsProcessRunning(string executablePath) + { + try + { + foreach(var process in Process.GetProcesses()) + { + if (String.Equals(executablePath, TryGetExecutableName(process))) + if (!process.HasExited) + return true; + } + + return false; + } + catch (Exception) + { + return false; + } + } + private void ProcessShouldBeRunning(int pid) { this.Property(x => IsProcessRunning(pid)).ShouldEventually().BeTrue(); @@ -580,6 +611,30 @@ public void TestUseByReferenceTypeAfterRestart() } } + [Test] + [Repeat(20)] + public void TestKillProcessBeforeDispose() + { + using (var logCollector = new LogCollector(new []{ "SharpRemote.Hosting.OutOfProcessSilo" }, new []{Level.All})) + using (var silo = new SharpRemote.Hosting.OutOfProcessSilo(failureHandler: new RestartOnFailureStrategy())) + { + logCollector.AutoPrint(TestContext.Progress); + silo.Start(); + + var pid = silo.HostProcessId; + pid.Should().HaveValue("because the process should be running"); + var process = Process.GetProcessById(pid.Value); + var executablePath = process.MainModule.FileName; + + process.Kill(); + Thread.Sleep(TimeSpan.FromMilliseconds(100)); + silo.Dispose(); + + Thread.Sleep(TimeSpan.FromSeconds(1)); + IsProcessRunning(executablePath).Should().BeFalse(); + } + } + private void RestartHost(SharpRemote.Hosting.OutOfProcessSilo silo) { var pid = silo.HostProcessId; diff --git a/SharpRemote/Hosting/OutOfProcess/OutOfProcessQueue.cs b/SharpRemote/Hosting/OutOfProcess/OutOfProcessQueue.cs index 9e43525..4e51efe 100644 --- a/SharpRemote/Hosting/OutOfProcess/OutOfProcessQueue.cs +++ b/SharpRemote/Hosting/OutOfProcess/OutOfProcessQueue.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; +using System.Diagnostics.Eventing.Reader; using System.Net; using System.Reflection; using System.Threading; @@ -48,6 +49,7 @@ public enum OperationType private ConnectionId _currentConnection; private int _currentPid; private volatile bool _isDisposed; + private volatile bool _isDisposing; private bool _started; public OutOfProcessQueue( @@ -84,8 +86,43 @@ FailureSettings failureSettings public void Dispose() { - _process.OnFaultDetected -= ProcessOnOnFaultDetected; - _isDisposed = true; + lock (_syncRoot) + { + if (_isDisposed) + { + // We don't return _isDisposing is true because that would allow concurrent Disposes to behave differently. + // The first call to Dispose() would block until the worker thread is stopped wheres the second (parallel) call + // to Dispose() would not block. + // This is undesirable. + return; + } + + _process.OnFaultDetected -= ProcessOnOnFaultDetected; + _isDisposing = true; + } + + Log.Debug("Waiting for worker thread to stop..."); + var timeout = TimeSpan.FromSeconds(5); + if (!_thread.Join(timeout)) //< We MUST wait for this thread outside of the lock, otherwise a deadlock becomes incredibly likely + { + // When this happens, it is likely that we're in a deadlock situation and we do not want to risk that. + // (For example a user could have managed to call Dispose() from within the worker thread. + // We have two different ways to continue here: + // 1) Cause a deadlock + // 2) Break the guarantee that Dispose() blocks until the worker thread has finished executing + // Out of the two given choices, 2) is the preferred one which is implemented here. + Log.WarnFormat("Worker thread did not stop after waiting for {0} seconds, continuing regardless...", timeout.TotalSeconds); + } + else + { + Log.Debug("Worker thread successfully stopped"); + } + + lock (_syncRoot) + { + _isDisposed = true; + _isDisposing = false; + } } /// @@ -115,7 +152,7 @@ private void EndPointOnOnFailure(EndPointDisconnectReason endPointReason, Connec { lock (_syncRoot) { - // If we're disposing this silo (or have disposed it alrady), then the heartbeat monitor + // If we're disposing this silo (or have disposed it already), then the heartbeat monitor // reported a failure that we caused intentionally (by killing the host process) and thus // this "failure" musn't be reported. if (_isDisposed) @@ -164,10 +201,10 @@ private void ProcessOnOnFaultDetected(int pid, ProcessFailureReason processFailu { lock (_syncRoot) { - // If we're disposing this silo (or have disposed it alrady), then the heartbeat monitor + // If we're disposing this silo (or have disposed it already), then the heartbeat monitor // reported a failure that we caused intentionally (by killing the host process) and thus // this "failure" musn't be reported. - if (_isDisposed) + if (_isDisposed || _isDisposing) return; } @@ -192,7 +229,7 @@ public int CurrentPid private void Do() { - while (!_isDisposed) + while (!_isDisposed && !_isDisposing) { try { @@ -410,7 +447,7 @@ private bool ProcessStartFailure(int currentRestart, /// internal OperationResult DoHandleFailure(Operation op) { - if (_isDisposed) + if (_isDisposed || _isDisposing) { Log.DebugFormat("Ignoring failure '{0}' because this queue has been disposed of already", op); return OperationResult.Ignored;