From edd7f4dbf3253ce5a95762b4d4a54b27774b56f2 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Mon, 16 Mar 2026 13:04:48 -0700 Subject: [PATCH 1/5] fix: make sure daemons are brought down cleanly --- src/cocoindex_code/cli.py | 10 ++-- src/cocoindex_code/client.py | 111 +++++++++++++++++++++++------------ src/cocoindex_code/daemon.py | 17 ++++-- 3 files changed, 91 insertions(+), 47 deletions(-) diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index f8e1706..98fd94c 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -467,26 +467,26 @@ def daemon_restart() -> None: @daemon_app.command("stop") def daemon_stop() -> None: """Stop the daemon.""" - from .client import stop_daemon + from .client import is_daemon_running, stop_daemon from .daemon import daemon_pid_path pid_path = daemon_pid_path() - if not pid_path.exists(): + if not pid_path.exists() and not is_daemon_running(): _typer.echo("Daemon is not running.") return stop_daemon() - # Wait for process to exit + # Wait for process to exit (check both pid file and socket) import time deadline = time.monotonic() + 5.0 while time.monotonic() < deadline: - if not pid_path.exists(): + if not pid_path.exists() and not is_daemon_running(): break time.sleep(0.1) - if pid_path.exists(): + if pid_path.exists() or is_daemon_running(): _typer.echo("Warning: daemon may not have stopped cleanly.", err=True) else: _typer.echo("Daemon stopped.") diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index bb6f5d7..e5d4e86 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -205,12 +205,36 @@ def _find_ccc_executable() -> str | None: return None +def _pid_alive(pid: int) -> bool: + """Return True if *pid* is still running.""" + try: + os.kill(pid, 0) # signal 0: check existence without killing + return True + except ProcessLookupError: + return False + except PermissionError: + return True # process exists but we can't signal it + + def stop_daemon() -> None: """Stop the daemon gracefully. - Sends a StopRequest, waits for the process to exit, falls back to SIGTERM. + Sends a StopRequest, waits for the process to exit, falls back to + SIGTERM → SIGKILL. Only removes the PID file after confirming that + the specific PID is no longer alive. """ - # Step 1: try sending StopRequest + pid_path = daemon_pid_path() + + # Read the PID early so we can track the actual process. + pid: int | None = None + try: + pid = int(pid_path.read_text().strip()) + if pid == os.getpid(): + pid = None # safety: never kill ourselves + except (FileNotFoundError, ValueError): + pass + + # Step 1: try sending StopRequest via socket try: client = DaemonClient.connect() client.handshake() @@ -220,65 +244,78 @@ def stop_daemon() -> None: pass # Step 2: wait for process to exit (up to 5s) - pid_path = daemon_pid_path() - deadline = time.monotonic() + 5.0 - while time.monotonic() < deadline and pid_path.exists(): - time.sleep(0.1) - - if not pid_path.exists(): - return # Clean exit + if pid is not None: + deadline = time.monotonic() + 5.0 + while time.monotonic() < deadline and _pid_alive(pid): + time.sleep(0.1) + if not _pid_alive(pid): + _cleanup_stale_files(pid_path, pid) + return # Step 3: if still running, try SIGTERM - pid: int | None = None - if pid_path.exists(): + if pid is not None and _pid_alive(pid): try: - pid = int(pid_path.read_text().strip()) - if pid != os.getpid(): - os.kill(pid, signal.SIGTERM) - else: - pid = None - except (ValueError, ProcessLookupError, PermissionError): + os.kill(pid, signal.SIGTERM) + except (ProcessLookupError, PermissionError): pass - # Wait a bit more deadline = time.monotonic() + 2.0 - while time.monotonic() < deadline and pid_path.exists(): + while time.monotonic() < deadline and _pid_alive(pid): time.sleep(0.1) - # Step 4: if still running, escalate to SIGKILL (Unix only; + if not _pid_alive(pid): + _cleanup_stale_files(pid_path, pid) + return + + # Step 4: escalate to SIGKILL (Unix only; # on Windows SIGTERM already calls TerminateProcess) - if sys.platform != "win32" and pid_path.exists(): + if sys.platform != "win32" and pid is not None and _pid_alive(pid): try: - pid = int(pid_path.read_text().strip()) - if pid != os.getpid(): - os.kill(pid, signal.SIGKILL) - else: - pid = None - except (ValueError, ProcessLookupError, PermissionError): + os.kill(pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): pass + # SIGKILL is async; give the kernel a moment to reap + deadline = time.monotonic() + 1.0 + while time.monotonic() < deadline and _pid_alive(pid): + time.sleep(0.1) + # Step 4b: on Windows, wait for the process to fully exit after TerminateProcess # so that named pipe handles are released before starting a new daemon. if sys.platform == "win32" and pid is not None: deadline = time.monotonic() + 3.0 - while time.monotonic() < deadline: - try: - os.kill(pid, 0) # Check if process still exists - time.sleep(0.1) - except (ProcessLookupError, PermissionError, OSError): - break # Process has exited + while time.monotonic() < deadline and _pid_alive(pid): + time.sleep(0.1) # Step 5: clean up stale files + _cleanup_stale_files(pid_path, pid) + + +def _cleanup_stale_files(pid_path: Path, pid: int | None) -> None: + """Remove socket and PID file after the daemon has exited. + + Only removes the PID file when *pid* matches what is on disk, to + avoid accidentally deleting a newer daemon's PID file. + """ if sys.platform != "win32": sock = daemon_socket_path() try: Path(sock).unlink(missing_ok=True) except Exception: pass - try: - pid_path.unlink(missing_ok=True) - except Exception: - pass + if pid is not None: + try: + stored = pid_path.read_text().strip() + if stored == str(pid): + pid_path.unlink(missing_ok=True) + except (FileNotFoundError, ValueError): + pass + else: + # No PID known — cautiously remove if file exists + try: + pid_path.unlink(missing_ok=True) + except Exception: + pass def _wait_for_daemon(timeout: float = 30.0) -> None: diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index a1a4286..dcfbcd8 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -447,17 +447,24 @@ def run_daemon() -> None: try: asyncio.run(_async_daemon_main(embedder)) finally: - # Clean up PID file and socket (named pipes on Windows clean up automatically) - try: - pid_path.unlink(missing_ok=True) - except Exception: - pass + # Clean up socket first, then PID file last. + # The PID file is the authoritative "daemon is alive" indicator, so it + # must be the very last thing removed to avoid races where a client + # sees the PID gone but the socket (or process) is still lingering. if sys.platform != "win32": sock = daemon_socket_path() try: Path(sock).unlink(missing_ok=True) except Exception: pass + # Only remove the PID file if it still contains *our* PID. + # A new daemon may have already overwritten it during a restart race. + try: + stored = pid_path.read_text().strip() + if stored == str(os.getpid()): + pid_path.unlink(missing_ok=True) + except Exception: + pass logger.info("Daemon stopped") From d5c6c8b6b726536260a93ac4d058f789bda98049 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Mon, 16 Mar 2026 13:19:12 -0700 Subject: [PATCH 2/5] chore: be more verbose in pre-commit workflow --- .github/workflows/pre-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index b2931a7..4c2c20d 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -30,4 +30,4 @@ jobs: run: uv sync --python ${{ matrix.python-version }} --python-preference only-managed - name: Run prek - run: uv run prek run --all-files + run: uv run prek run --all-files --verbose --show-diff-on-failure From c16a364b9e6b26be7a7290d187c5c20d09949bb1 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Mon, 16 Mar 2026 13:41:53 -0700 Subject: [PATCH 3/5] fix: split CI workflow to avoid prek output capture deadlock on Windows prek --verbose captures subprocess stdout via pipe. On Windows, the 4KB pipe buffer fills when pytest produces verbose output, causing a deadlock (pytest blocks writing, prek blocks waiting for exit). Split into two steps: prek --skip pytest for lint checks, and pytest running directly for streaming output. Also: - Add OSError catch in _pid_alive for Windows edge cases - Properly shut down daemon thread in test_daemon.py session fixture Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/pre-commit.yml | 7 +++++-- src/cocoindex_code/client.py | 29 ++++++++++++++++++++++------- tests/test_daemon.py | 13 +++++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 4c2c20d..e2ee8a4 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -29,5 +29,8 @@ jobs: - name: Install dependencies run: uv sync --python ${{ matrix.python-version }} --python-preference only-managed - - name: Run prek - run: uv run prek run --all-files --verbose --show-diff-on-failure + - name: Lint & format (prek) + run: uv run prek run --all-files --verbose --show-diff-on-failure --skip pytest + + - name: Test (pytest) + run: uv run pytest tests/ -v diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index e5d4e86..f6b8460 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -183,13 +183,26 @@ def start_daemon() -> None: cmd = [sys.executable, "-m", "cocoindex_code.cli", "run-daemon"] log_fd = open(log_path, "a") - subprocess.Popen( - cmd, - start_new_session=True, - stdout=log_fd, - stderr=log_fd, - stdin=subprocess.DEVNULL, - ) + if sys.platform == "win32": + # DETACHED_PROCESS fully detaches the daemon from the parent console, + # preventing its exit code from leaking back to the calling shell. + _create_new_process_group = 0x00000200 + _detached_process = 0x00000008 + subprocess.Popen( + cmd, + stdout=log_fd, + stderr=log_fd, + stdin=subprocess.DEVNULL, + creationflags=_create_new_process_group | _detached_process, + ) + else: + subprocess.Popen( + cmd, + start_new_session=True, + stdout=log_fd, + stderr=log_fd, + stdin=subprocess.DEVNULL, + ) log_fd.close() @@ -214,6 +227,8 @@ def _pid_alive(pid: int) -> bool: return False except PermissionError: return True # process exists but we can't signal it + except OSError: + return False # assume dead on unexpected OS errors (e.g. Windows edge cases) def stop_daemon() -> None: diff --git a/tests/test_daemon.py b/tests/test_daemon.py index e376af8..58bb80c 100644 --- a/tests/test_daemon.py +++ b/tests/test_daemon.py @@ -29,6 +29,7 @@ RemoveProjectRequest, Response, SearchRequest, + StopRequest, decode_response, encode_request, ) @@ -92,6 +93,18 @@ def daemon_sock() -> Iterator[str]: yield sock_path + # Gracefully shut down the daemon thread so named pipes are released on Windows + try: + conn = Client(sock_path, family=_connection_family()) + conn.send_bytes(encode_request(HandshakeRequest(version=__version__))) + conn.recv_bytes() + conn.send_bytes(encode_request(StopRequest())) + conn.recv_bytes() + conn.close() + except Exception: + pass + thread.join(timeout=5) + # Restore patches and env var dm.create_embedder = _orig_create_embedder # type: ignore[attr-defined] if old_env is None: From d4fcdb0388b0658aea118d8887026879a3e68619 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Mon, 16 Mar 2026 14:11:19 -0700 Subject: [PATCH 4/5] fix: catch SystemError from os.kill on Windows On Windows, os.kill(pid, 0) raises SystemError when the target process has exited but its handle state is in transition (WinError 87). This corrupts CPython C exception state, causing subsequent built-in calls like time.monotonic() to also raise SystemError. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cocoindex_code/client.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index f6b8460..9f09bdc 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -227,8 +227,11 @@ def _pid_alive(pid: int) -> bool: return False except PermissionError: return True # process exists but we can't signal it - except OSError: - return False # assume dead on unexpected OS errors (e.g. Windows edge cases) + except (OSError, SystemError): + # On Windows, os.kill(pid, 0) can raise SystemError or unexpected OSError + # variants (e.g. WinError 87 "The parameter is incorrect") for PIDs that + # have exited but whose handles haven't been fully released. + return False def stop_daemon() -> None: From db70adc32a5735f1f10343981f25f8128cf368a8 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Mon, 16 Mar 2026 14:30:53 -0700 Subject: [PATCH 5/5] fix: avoid os.kill on Windows due to CPython C exception state corruption os.kill(pid, 0) on Windows corrupts CPython C-level exception state even after the raised OSError is caught. This causes subsequent calls to C built-ins (time.monotonic, time.sleep) to raise SystemError. Use ctypes OpenProcess instead, which is the proper Win32 API for checking process existence. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cocoindex_code/client.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index 9f09bdc..f44005c 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -220,6 +220,19 @@ def _find_ccc_executable() -> str | None: def _pid_alive(pid: int) -> bool: """Return True if *pid* is still running.""" + if sys.platform == "win32": + # Avoid os.kill(pid, 0) on Windows — it has a CPython bug that corrupts + # the C-level exception state, causing subsequent C function calls + # (time.monotonic, time.sleep) to raise SystemError even after the + # OSError is caught. Use OpenProcess via ctypes instead. + import ctypes + + kernel32 = getattr(ctypes, "windll").kernel32 + handle = kernel32.OpenProcess(0x1000, False, pid) # PROCESS_QUERY_LIMITED_INFORMATION + if handle: + kernel32.CloseHandle(handle) + return True + return False try: os.kill(pid, 0) # signal 0: check existence without killing return True @@ -227,11 +240,6 @@ def _pid_alive(pid: int) -> bool: return False except PermissionError: return True # process exists but we can't signal it - except (OSError, SystemError): - # On Windows, os.kill(pid, 0) can raise SystemError or unexpected OSError - # variants (e.g. WinError 87 "The parameter is incorrect") for PIDs that - # have exited but whose handles haven't been fully released. - return False def stop_daemon() -> None: