From 7a265d571b6e2182526167e9bed0bcea6b350c15 Mon Sep 17 00:00:00 2001
From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com>
Date: Sat, 31 Jan 2026 14:27:49 +0100
Subject: [PATCH 01/38] Add kernel runtime management with Docker containerization (#281)

* Add Docker-based kernel system for isolated Python code execution

Introduces two components:
- kernel_runtime/: Standalone FastAPI service that runs inside Docker
  containers, providing code execution with artifact storage and
  parquet-based data I/O via the flowfile client API
- flowfile_core/kernel/: Orchestration layer that manages kernel containers
  (create, start, stop, delete, execute) using docker-py, with full REST API
  routes integrated into the core backend

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk
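For illustration only (not part of this patch): user code executed on a kernel
uses the `flowfile` client API that kernel_runtime injects into the exec()
namespace during /execute. A minimal sketch, assuming a staged input named
"main" with a hypothetical column "x":

    import polars as pl

    # `flowfile` is provided by the kernel runtime; user code does not import it.
    df = flowfile.read_input()          # LazyFrame scanned from the "main" input parquet
    df = df.with_columns((pl.col("x") * 2).alias("x_doubled"))
    flowfile.publish_output(df)         # collected and written to <output_dir>/main.parquet
    flowfile.publish_artifact("row_count", df.collect().height)

On the orchestration side, flow_graph.add_python_script stages the input
parquet into the shared volume, calls KernelManager.execute_sync, and reads
the published output parquet back.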
* Add python_script node type for kernel-based code execution

- PythonScriptInput/NodePythonScript schemas in input_schema.py
- add_python_script method in flow_graph.py that stages input parquet to
  shared volume, executes on kernel, reads output back
- get_kernel_manager singleton in kernel/__init__.py
- python_script node template registered in node_store

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* Add integration tests for Docker-based kernel system

- kernel_fixtures.py: builds the flowfile-kernel Docker image, creates a
  KernelManager with a temp shared volume, starts a container, and tears
  everything down via a managed_kernel() context manager
- conftest.py: adds session-scoped kernel_manager fixture
- test_kernel_integration.py: full integration tests covering:
  - TestKernelRuntime: health check, stdout/stderr capture, syntax errors,
    artifact publish/list, parquet read/write round-trip, multiple named
    inputs, execution timing
  - TestPythonScriptNode: python_script node passthrough and transform via
    FlowGraph.run_graph(), plus missing kernel_id error handling
- manager.py: expose shared_volume_path as public property
- flow_graph.py: use public property instead of private attribute

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* update poetry version

* Fix kernel system: singleton routing, state machine, sync execution, port resilience

- routes.py: use get_kernel_manager() singleton instead of creating a
  separate KernelManager instance (was causing dual-state bug)
- models.py: replace RUNNING with IDLE/EXECUTING states; store memory_gb,
  cpu_cores, gpu on KernelInfo from KernelConfig
- manager.py: add _reclaim_running_containers() on init to discover existing
  flowfile-kernel-* containers and reclaim their ports; port allocation now
  scans for available ports instead of incrementing; add execute_sync() using
  httpx.Client for clean sync usage; state transitions:
  IDLE -> EXECUTING -> IDLE during execute()
- flow_graph.py: use execute_sync() instead of fragile
  asyncio.run/ThreadPoolExecutor dance
- test: update state assertion from "running" to "idle"

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* Fix kernel health check and test fixture resilience

- _wait_for_healthy: catch all httpx errors (including RemoteProtocolError)
  during startup polling, not just ConnectError/ReadError/ConnectTimeout
- conftest kernel_manager fixture: wrap managed_kernel() in try/except so
  container start failures produce pytest.skip instead of ERROR

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* removing breakpoint

* Run kernel integration tests in parallel CI worker

- Add pytest.mark.kernel marker to test_kernel_integration.py
- Register 'kernel' marker in pyproject.toml
- Exclude kernel tests from main backend-tests with -m "not kernel" (both
  Linux and Windows jobs)
- Add dedicated kernel-tests job that runs in parallel: builds Docker image,
  runs -m kernel tests, 15min timeout
- Add kernel_runtime paths to change detection filters
- Include kernel-tests in test-summary aggregation

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* Remove --timeout flag from kernel CI step (pytest-timeout not installed)

The job-level timeout-minutes: 15 already handles this.

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* Add unit tests for kernel_runtime (artifact_store, flowfile_client, endpoints)

42 tests covering the three kernel_runtime modules:
- artifact_store: publish/get/list/clear, metadata, thread safety
- flowfile_client: context management, parquet I/O, artifacts
- main.py endpoints: /health, /execute, /artifacts, /clear, parquet round-trips

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* Add *.egg-info/ to .gitignore

https://claude.ai/code/session_013cuypEib6mAn3uGJNmqnfk

* Add kernel_runtime unit tests to CI kernel-tests job

Installs kernel_runtime with test deps and runs its 42 unit tests before the
Docker-dependent integration tests.
---
 .github/workflows/test.yaml | 57 ++-
 .gitignore | 3 +
 .../flowfile_core/configs/node_store/nodes.py | 12 +
 .../flowfile_core/flowfile/flow_graph.py | 59 +++
 .../flowfile_core/kernel/__init__.py | 32 ++
 flowfile_core/flowfile_core/kernel/manager.py | 270 +++++++++++
 flowfile_core/flowfile_core/kernel/models.py | 52 +++
 flowfile_core/flowfile_core/kernel/routes.py | 103 +++++
 flowfile_core/flowfile_core/main.py | 2 +
 .../flowfile_core/schemas/input_schema.py | 13 +
 flowfile_core/tests/conftest.py | 19 +
 .../tests/flowfile/test_kernel_integration.py | 431 ++++++++++++++++++
 flowfile_core/tests/kernel_fixtures.py | 127 ++++++
 kernel_runtime/Dockerfile | 23 +
 kernel_runtime/entrypoint.sh | 9 +
 kernel_runtime/kernel_runtime/__init__.py | 0
 .../kernel_runtime/artifact_store.py | 41 ++
 .../kernel_runtime/flowfile_client.py | 71 +++
 kernel_runtime/kernel_runtime/main.py | 102 +++++
 kernel_runtime/pyproject.toml | 22 +
 kernel_runtime/tests/__init__.py | 0
 kernel_runtime/tests/conftest.py | 40 ++
 kernel_runtime/tests/test_artifact_store.py | 101 ++++
 kernel_runtime/tests/test_flowfile_client.py | 168 +++++++
 kernel_runtime/tests/test_main.py | 331 ++++++++++++++
 poetry.lock | 22 +-
 pyproject.toml | 4 +-
 27 files changed, 2093 insertions(+), 21 deletions(-)
 create mode 100644 flowfile_core/flowfile_core/kernel/__init__.py
 create mode 100644 flowfile_core/flowfile_core/kernel/manager.py
 create mode 100644 flowfile_core/flowfile_core/kernel/models.py
 create mode 100644 flowfile_core/flowfile_core/kernel/routes.py
 create mode 100644 flowfile_core/tests/flowfile/test_kernel_integration.py
 create mode 100644 flowfile_core/tests/kernel_fixtures.py
 create mode 100644 kernel_runtime/Dockerfile
 create mode 100755 kernel_runtime/entrypoint.sh
 create mode 100644 kernel_runtime/kernel_runtime/__init__.py
 create mode 100644 kernel_runtime/kernel_runtime/artifact_store.py
 create mode 100644 kernel_runtime/kernel_runtime/flowfile_client.py
 create mode 100644 kernel_runtime/kernel_runtime/main.py
 create mode 100644 kernel_runtime/pyproject.toml
 create mode 100644 kernel_runtime/tests/__init__.py
 create mode 100644 kernel_runtime/tests/conftest.py
 create mode 100644 kernel_runtime/tests/test_artifact_store.py
 create mode 100644 kernel_runtime/tests/test_flowfile_client.py
 create mode 100644 kernel_runtime/tests/test_main.py
diff --git
a/.github/workflows/test.yaml b/.github/workflows/test.yaml index bef75991b..2f404997b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,6 +25,7 @@ jobs: backend_worker: ${{ steps.filter.outputs.backend_worker }} backend_frame: ${{ steps.filter.outputs.backend_frame }} backend_flowfile: ${{ steps.filter.outputs.backend_flowfile }} + kernel: ${{ steps.filter.outputs.kernel }} frontend: ${{ steps.filter.outputs.frontend }} docs: ${{ steps.filter.outputs.docs }} shared: ${{ steps.filter.outputs.shared }} @@ -46,6 +47,11 @@ jobs: - 'flowfile_frame/**' backend_flowfile: - 'flowfile/**' + kernel: + - 'kernel_runtime/**' + - 'flowfile_core/flowfile_core/kernel/**' + - 'flowfile_core/tests/flowfile/test_kernel_integration.py' + - 'flowfile_core/tests/kernel_fixtures.py' frontend: - 'flowfile_frontend/**' docs: @@ -145,7 +151,7 @@ jobs: needs.detect-changes.outputs.shared == 'true' || needs.detect-changes.outputs.test_workflow == 'true' || github.event.inputs.run_all_tests == 'true' - run: poetry run pytest flowfile_core/tests --disable-warnings $COV_ARGS + run: poetry run pytest flowfile_core/tests -m "not kernel" --disable-warnings $COV_ARGS env: COV_ARGS: ${{ (matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12') && '--cov --cov-append --cov-report=' || '' }} @@ -271,7 +277,7 @@ jobs: needs.detect-changes.outputs.test_workflow == 'true' || github.event.inputs.run_all_tests == 'true' shell: pwsh - run: poetry run pytest flowfile_core/tests --disable-warnings + run: poetry run pytest flowfile_core/tests -m "not kernel" --disable-warnings - name: Run pytest for flowfile_worker if: | @@ -299,6 +305,48 @@ jobs: shell: pwsh run: poetry run pytest flowfile/tests --disable-warnings + # Kernel integration tests - runs in parallel on a separate worker + kernel-tests: + needs: detect-changes + if: | + needs.detect-changes.outputs.kernel == 'true' || + needs.detect-changes.outputs.backend_core == 'true' || + needs.detect-changes.outputs.shared == 'true' || + needs.detect-changes.outputs.test_workflow == 'true' || + github.event.inputs.run_all_tests == 'true' + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: 'pip' + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + echo "$HOME/.poetry/bin" >> $GITHUB_PATH + + - name: Install Dependencies + run: | + poetry install --no-interaction --no-ansi --with dev + + - name: Build kernel Docker image + run: | + docker build -t flowfile-kernel -f kernel_runtime/Dockerfile kernel_runtime/ + + - name: Run kernel_runtime unit tests + run: | + pip install -e "kernel_runtime/[test]" + python -m pytest kernel_runtime/tests -v --disable-warnings + + - name: Run kernel integration tests + run: | + poetry run pytest flowfile_core/tests -m kernel -v --disable-warnings + # Frontend web build test - runs when frontend changes or test workflow changes test-web: needs: detect-changes @@ -472,7 +520,7 @@ jobs: # Summary job - always runs to provide status test-summary: - needs: [detect-changes, backend-tests, backend-tests-windows, test-web, electron-tests-macos, electron-tests-windows, docs-test] + needs: [detect-changes, backend-tests, backend-tests-windows, kernel-tests, test-web, electron-tests-macos, electron-tests-windows, docs-test] if: always() runs-on: ubuntu-latest steps: @@ -485,6 +533,7 @@ jobs: echo " - Backend Worker: ${{ 
needs.detect-changes.outputs.backend_worker }}" echo " - Backend Frame: ${{ needs.detect-changes.outputs.backend_frame }}" echo " - Backend Flowfile: ${{ needs.detect-changes.outputs.backend_flowfile }}" + echo " - Kernel: ${{ needs.detect-changes.outputs.kernel }}" echo " - Frontend: ${{ needs.detect-changes.outputs.frontend }}" echo " - Docs: ${{ needs.detect-changes.outputs.docs }}" echo " - Shared/Dependencies: ${{ needs.detect-changes.outputs.shared }}" @@ -493,6 +542,7 @@ jobs: echo "Job results:" echo " - Backend Tests: ${{ needs.backend-tests.result }}" echo " - Backend Tests (Windows): ${{ needs.backend-tests-windows.result }}" + echo " - Kernel Tests: ${{ needs.kernel-tests.result }}" echo " - Web Tests: ${{ needs.test-web.result }}" echo " - Electron Tests (macOS): ${{ needs.electron-tests-macos.result }}" echo " - Electron Tests (Windows): ${{ needs.electron-tests-windows.result }}" @@ -501,6 +551,7 @@ jobs: # Fail if any non-skipped job failed if [[ "${{ needs.backend-tests.result }}" == "failure" ]] || \ [[ "${{ needs.backend-tests-windows.result }}" == "failure" ]] || \ + [[ "${{ needs.kernel-tests.result }}" == "failure" ]] || \ [[ "${{ needs.test-web.result }}" == "failure" ]] || \ [[ "${{ needs.electron-tests-macos.result }}" == "failure" ]] || \ [[ "${{ needs.electron-tests-windows.result }}" == "failure" ]] || \ diff --git a/.gitignore b/.gitignore index 6e586b8cd..fab22f65e 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,9 @@ htmlcov/ # Docker flowfile_data/ +# Egg info +*.egg-info/ + # Secrets and keys - NEVER commit these master_key.txt *.key diff --git a/flowfile_core/flowfile_core/configs/node_store/nodes.py b/flowfile_core/flowfile_core/configs/node_store/nodes.py index ef6e7840d..6346029db 100644 --- a/flowfile_core/flowfile_core/configs/node_store/nodes.py +++ b/flowfile_core/flowfile_core/configs/node_store/nodes.py @@ -286,6 +286,18 @@ def get_all_standard_nodes() -> tuple[list[NodeTemplate], dict[str, NodeTemplate drawer_title="Polars Code", drawer_intro="Write custom Polars DataFrame transformations", ), + NodeTemplate( + name="Python Script", + item="python_script", + input=1, + output=1, + transform_type="narrow", + image="polars_code.png", + node_group="transform", + node_type="process", + drawer_title="Python Script", + drawer_intro="Execute Python code on an isolated kernel container", + ), NodeTemplate( name="Read from Database", item="database_reader", diff --git a/flowfile_core/flowfile_core/flowfile/flow_graph.py b/flowfile_core/flowfile_core/flowfile/flow_graph.py index 323d79e7a..f62ecafb9 100644 --- a/flowfile_core/flowfile_core/flowfile/flow_graph.py +++ b/flowfile_core/flowfile_core/flowfile/flow_graph.py @@ -1116,6 +1116,65 @@ def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine: node = self.get_node(node_id=node_polars_code.node_id) node.results.errors = str(e) + @with_history_capture(HistoryActionType.UPDATE_SETTINGS) + def add_python_script(self, node_python_script: input_schema.NodePythonScript): + """Adds a node that executes Python code on a kernel container.""" + + def _func(flowfile_table: FlowDataEngine) -> FlowDataEngine: + from flowfile_core.kernel import ExecuteRequest, get_kernel_manager + + kernel_id = node_python_script.python_script_input.kernel_id + code = node_python_script.python_script_input.code + + if not kernel_id: + raise ValueError("No kernel selected for python_script node") + + manager = get_kernel_manager() + + node_id = node_python_script.node_id + flow_id = self.flow_id + + shared_base = 
manager.shared_volume_path + input_dir = os.path.join(shared_base, str(flow_id), str(node_id), "inputs") + output_dir = os.path.join(shared_base, str(flow_id), str(node_id), "outputs") + + os.makedirs(input_dir, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + + # Write input to parquet + input_paths: dict[str, str] = {} + input_path = os.path.join(input_dir, "main.parquet") + flowfile_table.data_frame.collect().write_parquet(input_path) + input_paths["main"] = f"/shared/{flow_id}/{node_id}/inputs/main.parquet" + + # Execute on kernel (synchronous — no async boundary issues) + request = ExecuteRequest( + node_id=node_id, + code=code, + input_paths=input_paths, + output_dir=f"/shared/{flow_id}/{node_id}/outputs", + ) + result = manager.execute_sync(kernel_id, request) + + if not result.success: + raise RuntimeError(f"Kernel execution failed: {result.error}") + + # Read output + output_path = os.path.join(output_dir, "main.parquet") + if os.path.exists(output_path): + return FlowDataEngine(pl.scan_parquet(output_path)) + + # No output published, pass through input + return flowfile_table + + self.add_node_step( + node_id=node_python_script.node_id, + function=_func, + node_type="python_script", + setting_input=node_python_script, + input_node_ids=[node_python_script.depending_on_id], + ) + def add_dependency_on_polars_lazy_frame(self, lazy_frame: pl.LazyFrame, node_id: int): """Adds a special node that directly injects a Polars LazyFrame into the graph. diff --git a/flowfile_core/flowfile_core/kernel/__init__.py b/flowfile_core/flowfile_core/kernel/__init__.py new file mode 100644 index 000000000..6a275c3f7 --- /dev/null +++ b/flowfile_core/flowfile_core/kernel/__init__.py @@ -0,0 +1,32 @@ +from flowfile_core.kernel.manager import KernelManager +from flowfile_core.kernel.models import ( + ExecuteRequest, + ExecuteResult, + KernelConfig, + KernelInfo, + KernelState, +) +from flowfile_core.kernel.routes import router + +__all__ = [ + "KernelManager", + "KernelConfig", + "KernelInfo", + "KernelState", + "ExecuteRequest", + "ExecuteResult", + "router", + "get_kernel_manager", +] + +_manager: KernelManager | None = None + + +def get_kernel_manager() -> KernelManager: + global _manager + if _manager is None: + from shared.storage_config import storage + + shared_path = str(storage.temp_directory / "kernel_shared") + _manager = KernelManager(shared_volume_path=shared_path) + return _manager diff --git a/flowfile_core/flowfile_core/kernel/manager.py b/flowfile_core/flowfile_core/kernel/manager.py new file mode 100644 index 000000000..6732c5392 --- /dev/null +++ b/flowfile_core/flowfile_core/kernel/manager.py @@ -0,0 +1,270 @@ +import asyncio +import logging +import socket + +import docker +import httpx + +from flowfile_core.kernel.models import ( + ExecuteRequest, + ExecuteResult, + KernelConfig, + KernelInfo, + KernelState, +) +from shared.storage_config import storage + +logger = logging.getLogger(__name__) + +_KERNEL_IMAGE = "flowfile-kernel" +_BASE_PORT = 19000 +_PORT_RANGE = 1000 # 19000-19999 +_HEALTH_TIMEOUT = 120 +_HEALTH_POLL_INTERVAL = 2 + + +def _is_port_available(port: int) -> bool: + """Check whether a TCP port is free on localhost.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.bind(("127.0.0.1", port)) + return True + except OSError: + return False + + +class KernelManager: + def __init__(self, shared_volume_path: str | None = None): + self._docker = docker.from_env() + self._kernels: dict[str, KernelInfo] = {} + self._shared_volume = 
shared_volume_path or str(storage.cache_directory) + self._reclaim_running_containers() + + @property + def shared_volume_path(self) -> str: + return self._shared_volume + + # ------------------------------------------------------------------ + # Port allocation + # ------------------------------------------------------------------ + + def _reclaim_running_containers(self) -> None: + """Discover running flowfile-kernel containers and reclaim their ports.""" + try: + containers = self._docker.containers.list( + filters={"name": "flowfile-kernel-", "status": "running"} + ) + except Exception as exc: + logger.warning("Could not list running containers: %s", exc) + return + + for container in containers: + name = container.name + if not name.startswith("flowfile-kernel-"): + continue + kernel_id = name[len("flowfile-kernel-"):] + + # Determine which host port is mapped + port = None + try: + bindings = container.attrs["NetworkSettings"]["Ports"].get("9999/tcp") + if bindings: + port = int(bindings[0]["HostPort"]) + except (KeyError, IndexError, TypeError, ValueError): + pass + + if port is not None and kernel_id not in self._kernels: + self._kernels[kernel_id] = KernelInfo( + id=kernel_id, + name=kernel_id, + state=KernelState.IDLE, + container_id=container.id, + port=port, + ) + logger.info( + "Reclaimed running kernel '%s' on port %d (container %s)", + kernel_id, port, container.short_id, + ) + + def _allocate_port(self) -> int: + """Find the next available port in the kernel port range.""" + used_ports = {k.port for k in self._kernels.values() if k.port is not None} + for port in range(_BASE_PORT, _BASE_PORT + _PORT_RANGE): + if port not in used_ports and _is_port_available(port): + return port + raise RuntimeError( + f"No available ports in range {_BASE_PORT}-{_BASE_PORT + _PORT_RANGE - 1}" + ) + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + async def create_kernel(self, config: KernelConfig) -> KernelInfo: + if config.id in self._kernels: + raise ValueError(f"Kernel '{config.id}' already exists") + + port = self._allocate_port() + kernel = KernelInfo( + id=config.id, + name=config.name, + state=KernelState.STOPPED, + port=port, + packages=config.packages, + memory_gb=config.memory_gb, + cpu_cores=config.cpu_cores, + gpu=config.gpu, + ) + self._kernels[config.id] = kernel + logger.info("Created kernel '%s' on port %d", config.id, port) + return kernel + + async def start_kernel(self, kernel_id: str) -> KernelInfo: + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state == KernelState.IDLE: + return kernel + + kernel.state = KernelState.STARTING + kernel.error_message = None + + try: + packages_str = " ".join(kernel.packages) + run_kwargs: dict = { + "detach": True, + "name": f"flowfile-kernel-{kernel_id}", + "ports": {"9999/tcp": kernel.port}, + "volumes": {self._shared_volume: {"bind": "/shared", "mode": "rw"}}, + "environment": {"KERNEL_PACKAGES": packages_str}, + "mem_limit": f"{kernel.memory_gb}g", + "nano_cpus": int(kernel.cpu_cores * 1e9), + } + container = self._docker.containers.run(_KERNEL_IMAGE, **run_kwargs) + kernel.container_id = container.id + await self._wait_for_healthy(kernel_id, timeout=_HEALTH_TIMEOUT) + kernel.state = KernelState.IDLE + logger.info("Kernel '%s' is idle (container %s)", kernel_id, container.short_id) + except Exception as exc: + kernel.state = KernelState.ERROR + kernel.error_message = str(exc) + logger.error("Failed to start kernel 
'%s': %s", kernel_id, exc) + self._cleanup_container(kernel_id) + raise + + return kernel + + async def stop_kernel(self, kernel_id: str) -> None: + kernel = self._get_kernel_or_raise(kernel_id) + self._cleanup_container(kernel_id) + kernel.state = KernelState.STOPPED + kernel.container_id = None + logger.info("Stopped kernel '%s'", kernel_id) + + async def delete_kernel(self, kernel_id: str) -> None: + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state in (KernelState.IDLE, KernelState.EXECUTING): + await self.stop_kernel(kernel_id) + del self._kernels[kernel_id] + logger.info("Deleted kernel '%s'", kernel_id) + + # ------------------------------------------------------------------ + # Execution + # ------------------------------------------------------------------ + + async def execute(self, kernel_id: str, request: ExecuteRequest) -> ExecuteResult: + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state not in (KernelState.IDLE, KernelState.EXECUTING): + raise RuntimeError(f"Kernel '{kernel_id}' is not running (state: {kernel.state})") + + kernel.state = KernelState.EXECUTING + try: + url = f"http://localhost:{kernel.port}/execute" + async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client: + response = await client.post(url, json=request.model_dump()) + response.raise_for_status() + return ExecuteResult(**response.json()) + finally: + # Only return to IDLE if we haven't been stopped/errored in the meantime + if kernel.state == KernelState.EXECUTING: + kernel.state = KernelState.IDLE + + def execute_sync(self, kernel_id: str, request: ExecuteRequest) -> ExecuteResult: + """Synchronous wrapper around execute() for use from non-async code.""" + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state not in (KernelState.IDLE, KernelState.EXECUTING): + raise RuntimeError(f"Kernel '{kernel_id}' is not running (state: {kernel.state})") + + kernel.state = KernelState.EXECUTING + try: + url = f"http://localhost:{kernel.port}/execute" + with httpx.Client(timeout=httpx.Timeout(300.0)) as client: + response = client.post(url, json=request.model_dump()) + response.raise_for_status() + return ExecuteResult(**response.json()) + finally: + if kernel.state == KernelState.EXECUTING: + kernel.state = KernelState.IDLE + + async def clear_artifacts(self, kernel_id: str) -> None: + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state not in (KernelState.IDLE, KernelState.EXECUTING): + raise RuntimeError(f"Kernel '{kernel_id}' is not running (state: {kernel.state})") + + url = f"http://localhost:{kernel.port}/clear" + async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client: + response = await client.post(url) + response.raise_for_status() + + # ------------------------------------------------------------------ + # Queries + # ------------------------------------------------------------------ + + async def list_kernels(self) -> list[KernelInfo]: + return list(self._kernels.values()) + + async def get_kernel(self, kernel_id: str) -> KernelInfo | None: + return self._kernels.get(kernel_id) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _get_kernel_or_raise(self, kernel_id: str) -> KernelInfo: + kernel = self._kernels.get(kernel_id) + if kernel is None: + raise KeyError(f"Kernel '{kernel_id}' not found") + return kernel + + def _cleanup_container(self, kernel_id: str) -> None: + kernel = self._kernels.get(kernel_id) + if 
kernel is None or kernel.container_id is None: + return + try: + container = self._docker.containers.get(kernel.container_id) + container.stop(timeout=10) + container.remove(force=True) + except docker.errors.NotFound: + pass + except Exception as exc: + logger.warning("Error cleaning up container for kernel '%s': %s", kernel_id, exc) + + async def _wait_for_healthy(self, kernel_id: str, timeout: int = _HEALTH_TIMEOUT) -> None: + kernel = self._get_kernel_or_raise(kernel_id) + url = f"http://localhost:{kernel.port}/health" + deadline = asyncio.get_event_loop().time() + timeout + + while asyncio.get_event_loop().time() < deadline: + try: + async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client: + response = await client.get(url) + if response.status_code == 200: + return + except httpx.HTTPError: + # Catches all transient errors: ConnectError, ReadError, + # ConnectTimeout, RemoteProtocolError, etc. + pass + except Exception: + # Safety net for unexpected errors during startup polling + pass + await asyncio.sleep(_HEALTH_POLL_INTERVAL) + + raise TimeoutError(f"Kernel '{kernel_id}' did not become healthy within {timeout}s") diff --git a/flowfile_core/flowfile_core/kernel/models.py b/flowfile_core/flowfile_core/kernel/models.py new file mode 100644 index 000000000..c753b1198 --- /dev/null +++ b/flowfile_core/flowfile_core/kernel/models.py @@ -0,0 +1,52 @@ +from datetime import datetime +from enum import Enum + +from pydantic import BaseModel, Field + + +class KernelState(str, Enum): + STOPPED = "stopped" + STARTING = "starting" + IDLE = "idle" + EXECUTING = "executing" + ERROR = "error" + + +class KernelConfig(BaseModel): + id: str + name: str + packages: list[str] = Field(default_factory=list) + cpu_cores: float = 2.0 + memory_gb: float = 4.0 + gpu: bool = False + + +class KernelInfo(BaseModel): + id: str + name: str + state: KernelState = KernelState.STOPPED + container_id: str | None = None + port: int | None = None + packages: list[str] = Field(default_factory=list) + memory_gb: float = 4.0 + cpu_cores: float = 2.0 + gpu: bool = False + created_at: datetime = Field(default_factory=datetime.now) + error_message: str | None = None + + +class ExecuteRequest(BaseModel): + node_id: int + code: str + input_paths: dict[str, str] = Field(default_factory=dict) + output_dir: str = "" + + +class ExecuteResult(BaseModel): + success: bool + output_paths: list[str] = Field(default_factory=list) + artifacts_published: list[str] = Field(default_factory=list) + stdout: str = "" + stderr: str = "" + error: str | None = None + execution_time_ms: float = 0.0 diff --git a/flowfile_core/flowfile_core/kernel/routes.py b/flowfile_core/flowfile_core/kernel/routes.py new file mode 100644 index 000000000..0058adc43 --- /dev/null +++ b/flowfile_core/flowfile_core/kernel/routes.py @@ -0,0 +1,103 @@ +from fastapi import APIRouter, HTTPException + +from flowfile_core.kernel.models import ExecuteRequest, ExecuteResult, KernelConfig, KernelInfo + + +def _get_manager(): + from flowfile_core.kernel import get_kernel_manager + + return get_kernel_manager() + + +router = APIRouter(prefix="/kernels") + + +@router.get("/", response_model=list[KernelInfo]) +async def list_kernels(): + return await _get_manager().list_kernels() + + +@router.post("/", response_model=KernelInfo) +async def create_kernel(config: KernelConfig): + try: + return await _get_manager().create_kernel(config) + except ValueError as exc: + raise HTTPException(status_code=409, detail=str(exc)) + + +@router.get("/{kernel_id}", 
response_model=KernelInfo) +async def get_kernel(kernel_id: str): + kernel = await _get_manager().get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + return kernel + + +@router.delete("/{kernel_id}") +async def delete_kernel(kernel_id: str): + try: + await _get_manager().delete_kernel(kernel_id) + return {"status": "deleted", "kernel_id": kernel_id} + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) + + +@router.post("/{kernel_id}/start", response_model=KernelInfo) +async def start_kernel(kernel_id: str): + try: + return await _get_manager().start_kernel(kernel_id) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) + except Exception as exc: + raise HTTPException(status_code=500, detail=str(exc)) + + +@router.post("/{kernel_id}/stop") +async def stop_kernel(kernel_id: str): + try: + await _get_manager().stop_kernel(kernel_id) + return {"status": "stopped", "kernel_id": kernel_id} + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) + + +@router.post("/{kernel_id}/execute", response_model=ExecuteResult) +async def execute_code(kernel_id: str, request: ExecuteRequest): + try: + return await _get_manager().execute(kernel_id, request) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) + except RuntimeError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + + +@router.get("/{kernel_id}/artifacts") +async def get_artifacts(kernel_id: str): + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if kernel.state.value not in ("idle", "executing"): + raise HTTPException(status_code=400, detail=f"Kernel '{kernel_id}' is not running") + + try: + import httpx + + url = f"http://localhost:{kernel.port}/artifacts" + async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client: + response = await client.get(url) + response.raise_for_status() + return response.json() + except Exception as exc: + raise HTTPException(status_code=500, detail=str(exc)) + + +@router.post("/{kernel_id}/clear") +async def clear_artifacts(kernel_id: str): + try: + await _get_manager().clear_artifacts(kernel_id) + return {"status": "cleared", "kernel_id": kernel_id} + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) + except RuntimeError as exc: + raise HTTPException(status_code=400, detail=str(exc)) diff --git a/flowfile_core/flowfile_core/main.py b/flowfile_core/flowfile_core/main.py index 24c4011e1..9f772f829 100644 --- a/flowfile_core/flowfile_core/main.py +++ b/flowfile_core/flowfile_core/main.py @@ -15,6 +15,7 @@ WORKER_PORT, WORKER_URL, ) +from flowfile_core.kernel import router as kernel_router from flowfile_core.routes.auth import router as auth_router from flowfile_core.routes.cloud_connections import router as cloud_connections_router from flowfile_core.routes.logs import router as logs_router @@ -87,6 +88,7 @@ async def shutdown_handler(app: FastAPI): app.include_router(secrets_router, prefix="/secrets", tags=["secrets"]) app.include_router(cloud_connections_router, prefix="/cloud_connections", tags=["cloud_connections"]) app.include_router(user_defined_components_router, prefix="/user_defined_components", tags=["user_defined_components"]) +app.include_router(kernel_router, tags=["kernels"]) @app.post("/shutdown") diff --git 
a/flowfile_core/flowfile_core/schemas/input_schema.py b/flowfile_core/flowfile_core/schemas/input_schema.py index b1fecc288..6ee8ba372 100644 --- a/flowfile_core/flowfile_core/schemas/input_schema.py +++ b/flowfile_core/flowfile_core/schemas/input_schema.py @@ -887,6 +887,19 @@ class NodePolarsCode(NodeMultiInput): polars_code_input: transform_schema.PolarsCodeInput +class PythonScriptInput(BaseModel): + """Settings for Python code execution on a kernel.""" + + code: str = "" + kernel_id: str | None = None + + +class NodePythonScript(NodeSingleInput): + """Node that executes Python code on a kernel container.""" + + python_script_input: PythonScriptInput = PythonScriptInput() + + class UserDefinedNode(NodeMultiInput): """Settings for a node that contains the user defined node information""" diff --git a/flowfile_core/tests/conftest.py b/flowfile_core/tests/conftest.py index 99b1754e0..d4f993f03 100644 --- a/flowfile_core/tests/conftest.py +++ b/flowfile_core/tests/conftest.py @@ -28,6 +28,7 @@ def _patched_hashpw(password, salt): from test_utils.postgres import fixtures as pg_fixtures from tests.flowfile_core_test_utils import is_docker_available +from tests.kernel_fixtures import managed_kernel def is_port_in_use(port, host='localhost'): @@ -263,3 +264,21 @@ def postgres_db(): if not db_info: pytest.fail("PostgreSQL container could not be started") yield db_info + + +@pytest.fixture(scope="session") +def kernel_manager(): + """ + Pytest fixture that builds the flowfile-kernel Docker image, creates a + KernelManager, starts a test kernel, and tears everything down afterwards. + + Yields a (KernelManager, kernel_id) tuple. + """ + if not is_docker_available(): + pytest.skip("Docker is not available, skipping kernel tests") + + try: + with managed_kernel() as ctx: + yield ctx + except Exception as exc: + pytest.skip(f"Kernel container could not be started: {exc}") diff --git a/flowfile_core/tests/flowfile/test_kernel_integration.py b/flowfile_core/tests/flowfile/test_kernel_integration.py new file mode 100644 index 000000000..37827aa48 --- /dev/null +++ b/flowfile_core/tests/flowfile/test_kernel_integration.py @@ -0,0 +1,431 @@ +""" +Integration tests for the Docker-based kernel system. + +These tests require Docker to be available. The ``kernel_manager`` fixture +(session-scoped, defined in conftest.py) builds the flowfile-kernel image, +starts a container, and tears it down after all tests in this module finish. 
+""" + +import asyncio +import os +from pathlib import Path +from typing import Literal + +import polars as pl +import pytest + +from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine +from flowfile_core.flowfile.flow_graph import FlowGraph, RunInformation, add_connection +from flowfile_core.flowfile.handler import FlowfileHandler +from flowfile_core.kernel.manager import KernelManager +from flowfile_core.kernel.models import ExecuteRequest, ExecuteResult +from flowfile_core.schemas import input_schema, schemas + +pytestmark = pytest.mark.kernel + + +# --------------------------------------------------------------------------- +# Helpers (same pattern as test_flowfile.py) +# --------------------------------------------------------------------------- + + +def _run(coro): + """Run an async coroutine from sync test code.""" + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +def _create_graph( + flow_id: int = 1, + execution_mode: Literal["Development", "Performance"] = "Development", + execution_location: Literal["local", "remote"] | None = "remote", +) -> FlowGraph: + handler = FlowfileHandler() + handler.register_flow( + schemas.FlowSettings( + flow_id=flow_id, + name="kernel_test_flow", + path=".", + execution_mode=execution_mode, + execution_location=execution_location, + ) + ) + return handler.get_flow(flow_id) + + +def _handle_run_info(run_info: RunInformation): + if not run_info.success: + errors = "errors:" + for step in run_info.node_step_result: + if not step.success: + errors += f"\n node_id:{step.node_id}, error: {step.error}" + raise ValueError(f"Graph should run successfully:\n{errors}") + + +# --------------------------------------------------------------------------- +# Tests — kernel runtime (direct manager interaction) +# --------------------------------------------------------------------------- + + +class TestKernelRuntime: + """Tests that exercise the kernel container directly via KernelManager.""" + + def test_health_check(self, kernel_manager: tuple[KernelManager, str]): + """Kernel container responds to health checks.""" + manager, kernel_id = kernel_manager + info = _run(manager.get_kernel(kernel_id)) + assert info is not None + assert info.state.value == "idle" + + def test_execute_print(self, kernel_manager: tuple[KernelManager, str]): + """Simple print() produces stdout.""" + manager, kernel_id = kernel_manager + result: ExecuteResult = _run( + manager.execute( + kernel_id, + ExecuteRequest( + node_id=1, + code='print("hello from kernel")', + input_paths={}, + output_dir="/shared/test_print", + ), + ) + ) + assert result.success + assert "hello from kernel" in result.stdout + assert result.error is None + + def test_execute_syntax_error(self, kernel_manager: tuple[KernelManager, str]): + """Syntax errors are captured, not raised.""" + manager, kernel_id = kernel_manager + result: ExecuteResult = _run( + manager.execute( + kernel_id, + ExecuteRequest( + node_id=2, + code="def broken(", + input_paths={}, + output_dir="/shared/test_syntax_err", + ), + ) + ) + assert not result.success + assert result.error is not None + + def test_publish_and_list_artifacts(self, kernel_manager: tuple[KernelManager, str]): + """publish_artifact stores an object; list_artifacts returns metadata.""" + manager, kernel_id = kernel_manager + + # Clear any leftover artifacts from previous tests + _run(manager.clear_artifacts(kernel_id)) + + result: ExecuteResult = _run( + manager.execute( + kernel_id, + 
ExecuteRequest( + node_id=3, + code='flowfile.publish_artifact("my_dict", {"a": 1, "b": 2})', + input_paths={}, + output_dir="/shared/test_artifact", + ), + ) + ) + assert result.success + assert "my_dict" in result.artifacts_published + + def test_read_and_write_parquet(self, kernel_manager: tuple[KernelManager, str]): + """Kernel can read input parquet and write output parquet.""" + manager, kernel_id = kernel_manager + shared = manager.shared_volume_path + + # Prepare input parquet + input_dir = os.path.join(shared, "test_rw", "inputs") + output_dir = os.path.join(shared, "test_rw", "outputs") + os.makedirs(input_dir, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + + df_in = pl.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]}) + df_in.write_parquet(os.path.join(input_dir, "main.parquet")) + + code = """ +import polars as pl +df = flowfile.read_input() +df = df.with_columns((pl.col("x") * pl.col("y")).alias("product")) +flowfile.publish_output(df) +""" + + result: ExecuteResult = _run( + manager.execute( + kernel_id, + ExecuteRequest( + node_id=4, + code=code, + input_paths={"main": "/shared/test_rw/inputs/main.parquet"}, + output_dir="/shared/test_rw/outputs", + ), + ) + ) + assert result.success, f"Kernel execution failed: {result.error}" + assert len(result.output_paths) > 0 + + # Verify output + out_path = os.path.join(output_dir, "main.parquet") + assert os.path.exists(out_path), f"Expected output parquet at {out_path}" + df_out = pl.read_parquet(out_path) + assert "product" in df_out.columns + assert df_out["product"].to_list() == [10, 40, 90] + + def test_multiple_inputs(self, kernel_manager: tuple[KernelManager, str]): + """Kernel can read multiple named inputs.""" + manager, kernel_id = kernel_manager + shared = manager.shared_volume_path + + input_dir = os.path.join(shared, "test_multi", "inputs") + output_dir = os.path.join(shared, "test_multi", "outputs") + os.makedirs(input_dir, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + + pl.DataFrame({"id": [1, 2], "name": ["a", "b"]}).write_parquet( + os.path.join(input_dir, "left.parquet") + ) + pl.DataFrame({"id": [1, 2], "score": [90, 80]}).write_parquet( + os.path.join(input_dir, "right.parquet") + ) + + code = """ +inputs = flowfile.read_inputs() +left = inputs["left"].collect() +right = inputs["right"].collect() +merged = left.join(right, on="id") +flowfile.publish_output(merged) +""" + result = _run( + manager.execute( + kernel_id, + ExecuteRequest( + node_id=5, + code=code, + input_paths={ + "left": "/shared/test_multi/inputs/left.parquet", + "right": "/shared/test_multi/inputs/right.parquet", + }, + output_dir="/shared/test_multi/outputs", + ), + ) + ) + assert result.success, f"Kernel execution failed: {result.error}" + + df_out = pl.read_parquet(os.path.join(output_dir, "main.parquet")) + assert set(df_out.columns) == {"id", "name", "score"} + assert len(df_out) == 2 + + def test_stderr_captured(self, kernel_manager: tuple[KernelManager, str]): + """Writes to stderr are captured.""" + manager, kernel_id = kernel_manager + result = _run( + manager.execute( + kernel_id, + ExecuteRequest( + node_id=6, + code='import sys; sys.stderr.write("warn\\n")', + input_paths={}, + output_dir="/shared/test_stderr", + ), + ) + ) + assert result.success + assert "warn" in result.stderr + + def test_execution_time_tracked(self, kernel_manager: tuple[KernelManager, str]): + """execution_time_ms is populated.""" + manager, kernel_id = kernel_manager + result = _run( + manager.execute( + kernel_id, + ExecuteRequest( + 
node_id=7, + code="x = sum(range(100000))", + input_paths={}, + output_dir="/shared/test_timing", + ), + ) + ) + assert result.success + assert result.execution_time_ms > 0 + + +# --------------------------------------------------------------------------- +# Tests — python_script node in FlowGraph +# --------------------------------------------------------------------------- + + +class TestPythonScriptNode: + """ + Tests that wire up the python_script node type inside a FlowGraph and + run the graph end-to-end against a real kernel container. + """ + + def test_python_script_passthrough(self, kernel_manager: tuple[KernelManager, str]): + """ + python_script node reads input, passes it through, and writes output. + """ + manager, kernel_id = kernel_manager + # Patch the singleton so flow_graph picks up *this* manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: manual input + data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, + node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: python_script + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + + code = """ +df = flowfile.read_input() +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, + node_id=2, + depending_on_id=1, + python_script_input=input_schema.PythonScriptInput( + code=code, + kernel_id=kernel_id, + ), + ) + ) + + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + result = graph.get_node(2).get_resulting_data() + assert result is not None + df = result.data_frame + if hasattr(df, "collect"): + df = df.collect() + assert len(df) == 2 + assert set(df.columns) >= {"name", "age"} + + finally: + _kernel_mod._manager = _prev + + def test_python_script_transform(self, kernel_manager: tuple[KernelManager, str]): + """ + python_script node transforms data (adds a column). 
+ """ + manager, kernel_id = kernel_manager + + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + data = [{"val": 1}, {"val": 2}, {"val": 3}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, + node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + + code = """ +import polars as pl +df = flowfile.read_input().collect() +df = df.with_columns((pl.col("val") * 10).alias("val_x10")) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, + node_id=2, + depending_on_id=1, + python_script_input=input_schema.PythonScriptInput( + code=code, + kernel_id=kernel_id, + ), + ) + ) + + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + result = graph.get_node(2).get_resulting_data() + assert result is not None + df = result.data_frame + if hasattr(df, "collect"): + df = df.collect() + assert "val_x10" in df.columns + assert df["val_x10"].to_list() == [10, 20, 30] + + finally: + _kernel_mod._manager = _prev + + def test_python_script_no_kernel_raises(self): + """ + If no kernel_id is set, the node should raise at execution time. + """ + graph = _create_graph() + + data = [{"a": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, + node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, + node_id=2, + depending_on_id=1, + python_script_input=input_schema.PythonScriptInput( + code='print("hi")', + kernel_id=None, # intentionally no kernel + ), + ) + ) + + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + run_info = graph.run_graph() + # Should fail because no kernel is selected + assert not run_info.success diff --git a/flowfile_core/tests/kernel_fixtures.py b/flowfile_core/tests/kernel_fixtures.py new file mode 100644 index 000000000..38afc2516 --- /dev/null +++ b/flowfile_core/tests/kernel_fixtures.py @@ -0,0 +1,127 @@ +""" +Kernel test fixtures. + +Provides utilities to build the flowfile-kernel Docker image, +create a KernelManager, start/stop kernels, and clean up. 
+""" + +import asyncio +import logging +import os +import subprocess +import tempfile +from collections.abc import Generator +from contextlib import contextmanager +from pathlib import Path + +logger = logging.getLogger("kernel_fixture") + +KERNEL_IMAGE = "flowfile-kernel" +KERNEL_TEST_ID = "integration-test" +KERNEL_CONTAINER_NAME = f"flowfile-kernel-{KERNEL_TEST_ID}" + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent + + +def _build_kernel_image() -> bool: + """Build the flowfile-kernel Docker image from kernel_runtime/.""" + dockerfile = _REPO_ROOT / "kernel_runtime" / "Dockerfile" + context = _REPO_ROOT / "kernel_runtime" + + if not dockerfile.exists(): + logger.error("Dockerfile not found at %s", dockerfile) + return False + + logger.info("Building Docker image '%s' ...", KERNEL_IMAGE) + try: + subprocess.run( + ["docker", "build", "-t", KERNEL_IMAGE, "-f", str(dockerfile), str(context)], + check=True, + capture_output=True, + text=True, + timeout=300, + ) + logger.info("Docker image '%s' built successfully", KERNEL_IMAGE) + return True + except subprocess.CalledProcessError as exc: + logger.error("Failed to build Docker image: %s\nstdout: %s\nstderr: %s", exc, exc.stdout, exc.stderr) + return False + except subprocess.TimeoutExpired: + logger.error("Docker build timed out") + return False + + +def _remove_container(name: str) -> None: + """Force-remove a container by name (ignore errors if it doesn't exist).""" + subprocess.run( + ["docker", "rm", "-f", name], + capture_output=True, + check=False, + ) + + +@contextmanager +def managed_kernel( + packages: list[str] | None = None, +) -> Generator[tuple, None, None]: + """ + Context manager that: + 1. Builds the flowfile-kernel Docker image + 2. Creates a KernelManager with a temp shared volume + 3. Creates and starts a kernel + 4. Yields (manager, kernel_id) + 5. 
Stops + deletes the kernel and cleans up + + Usage:: + + with managed_kernel(packages=["scikit-learn"]) as (manager, kernel_id): + result = await manager.execute(kernel_id, request) + """ + from flowfile_core.kernel.manager import KernelManager + from flowfile_core.kernel.models import KernelConfig + + # 1 — Build image + if not _build_kernel_image(): + raise RuntimeError("Could not build flowfile-kernel Docker image") + + # 2 — Ensure stale container is removed + _remove_container(KERNEL_CONTAINER_NAME) + + # 3 — Temp shared volume + shared_dir = tempfile.mkdtemp(prefix="kernel_test_shared_") + + manager = KernelManager(shared_volume_path=shared_dir) + kernel_id = KERNEL_TEST_ID + + try: + # 4 — Create + start + loop = asyncio.new_event_loop() + config = KernelConfig( + id=kernel_id, + name="Integration Test Kernel", + packages=packages or [], + ) + loop.run_until_complete(manager.create_kernel(config)) + loop.run_until_complete(manager.start_kernel(kernel_id)) + + yield manager, kernel_id + + finally: + # 5 — Tear down + try: + loop.run_until_complete(manager.stop_kernel(kernel_id)) + except Exception as exc: + logger.warning("Error stopping kernel during teardown: %s", exc) + try: + loop.run_until_complete(manager.delete_kernel(kernel_id)) + except Exception as exc: + logger.warning("Error deleting kernel during teardown: %s", exc) + loop.close() + + # Belt-and-suspenders: force-remove the container + _remove_container(KERNEL_CONTAINER_NAME) + + # Clean up shared dir + import shutil + + shutil.rmtree(shared_dir, ignore_errors=True) diff --git a/kernel_runtime/Dockerfile b/kernel_runtime/Dockerfile new file mode 100644 index 000000000..a5e509c33 --- /dev/null +++ b/kernel_runtime/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.12-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl build-essential && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + polars>=1.0.0 pyarrow>=14.0.0 numpy>=1.24.0 \ + fastapi>=0.115.0 uvicorn>=0.32.0 httpx>=0.24.0 + +COPY kernel_runtime /app/kernel_runtime +COPY entrypoint.sh /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh + +ENV KERNEL_PACKAGES="" +VOLUME ["/shared"] +EXPOSE 9999 + +HEALTHCHECK --interval=10s --timeout=5s --start-period=30s \ + CMD curl -f http://localhost:9999/health || exit 1 + +ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/kernel_runtime/entrypoint.sh b/kernel_runtime/entrypoint.sh new file mode 100755 index 000000000..70da434fe --- /dev/null +++ b/kernel_runtime/entrypoint.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +if [ -n "$KERNEL_PACKAGES" ]; then + echo "Installing packages: $KERNEL_PACKAGES" + pip install --no-cache-dir $KERNEL_PACKAGES +fi + +exec uvicorn kernel_runtime.main:app --host 0.0.0.0 --port 9999 diff --git a/kernel_runtime/kernel_runtime/__init__.py b/kernel_runtime/kernel_runtime/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kernel_runtime/kernel_runtime/artifact_store.py b/kernel_runtime/kernel_runtime/artifact_store.py new file mode 100644 index 000000000..5814b9350 --- /dev/null +++ b/kernel_runtime/kernel_runtime/artifact_store.py @@ -0,0 +1,41 @@ +import sys +import threading +from datetime import datetime, timezone +from typing import Any + + +class ArtifactStore: + """Thread-safe in-memory store for Python artifacts produced during kernel execution.""" + + def __init__(self): + self._lock = threading.Lock() + self._artifacts: dict[str, dict[str, Any]] = {} + + def publish(self, name: str, obj: Any, node_id: int) -> None: + with 
self._lock: + self._artifacts[name] = { + "object": obj, + "name": name, + "type_name": type(obj).__name__, + "module": type(obj).__module__, + "node_id": node_id, + "created_at": datetime.now(timezone.utc).isoformat(), + "size_bytes": sys.getsizeof(obj), + } + + def get(self, name: str) -> Any: + with self._lock: + if name not in self._artifacts: + raise KeyError(f"Artifact '{name}' not found") + return self._artifacts[name]["object"] + + def list_all(self) -> dict[str, dict[str, Any]]: + with self._lock: + return { + name: {k: v for k, v in meta.items() if k != "object"} + for name, meta in self._artifacts.items() + } + + def clear(self) -> None: + with self._lock: + self._artifacts.clear() diff --git a/kernel_runtime/kernel_runtime/flowfile_client.py b/kernel_runtime/kernel_runtime/flowfile_client.py new file mode 100644 index 000000000..4d1daacc3 --- /dev/null +++ b/kernel_runtime/kernel_runtime/flowfile_client.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any + +import polars as pl + +from kernel_runtime.artifact_store import ArtifactStore + +_context: dict[str, Any] = {} + + +def _set_context( + node_id: int, + input_paths: dict[str, str], + output_dir: str, + artifact_store: ArtifactStore, +) -> None: + _context["node_id"] = node_id + _context["input_paths"] = input_paths + _context["output_dir"] = output_dir + _context["artifact_store"] = artifact_store + + +def _clear_context() -> None: + _context.clear() + + +def _get_context_value(key: str) -> Any: + if key not in _context: + raise RuntimeError(f"flowfile context not initialized (missing '{key}'). This API is only available during /execute.") + return _context[key] + + +def read_input(name: str = "main") -> pl.LazyFrame: + input_paths: dict[str, str] = _get_context_value("input_paths") + if name not in input_paths: + available = list(input_paths.keys()) + raise KeyError(f"Input '{name}' not found. 
Available inputs: {available}") + return pl.scan_parquet(input_paths[name]) + + +def read_inputs() -> dict[str, pl.LazyFrame]: + input_paths: dict[str, str] = _get_context_value("input_paths") + return {name: pl.scan_parquet(path) for name, path in input_paths.items()} + + +def publish_output(df: pl.LazyFrame | pl.DataFrame, name: str = "main") -> None: + output_dir = _get_context_value("output_dir") + os.makedirs(output_dir, exist_ok=True) + output_path = Path(output_dir) / f"{name}.parquet" + if isinstance(df, pl.LazyFrame): + df = df.collect() + df.write_parquet(str(output_path)) + + +def publish_artifact(name: str, obj: Any) -> None: + store: ArtifactStore = _get_context_value("artifact_store") + node_id: int = _get_context_value("node_id") + store.publish(name, obj, node_id) + + +def read_artifact(name: str) -> Any: + store: ArtifactStore = _get_context_value("artifact_store") + return store.get(name) + + +def list_artifacts() -> dict: + store: ArtifactStore = _get_context_value("artifact_store") + return store.list_all() diff --git a/kernel_runtime/kernel_runtime/main.py b/kernel_runtime/kernel_runtime/main.py new file mode 100644 index 000000000..f518510d5 --- /dev/null +++ b/kernel_runtime/kernel_runtime/main.py @@ -0,0 +1,102 @@ +import contextlib +import io +import os +import time +from pathlib import Path + +from fastapi import FastAPI +from pydantic import BaseModel + +from kernel_runtime import flowfile_client +from kernel_runtime.artifact_store import ArtifactStore + +app = FastAPI(title="FlowFile Kernel Runtime") +artifact_store = ArtifactStore() + + +class ExecuteRequest(BaseModel): + node_id: int + code: str + input_paths: dict[str, str] = {} + output_dir: str = "" + + +class ExecuteResponse(BaseModel): + success: bool + output_paths: list[str] = [] + artifacts_published: list[str] = [] + stdout: str = "" + stderr: str = "" + error: str | None = None + execution_time_ms: float = 0.0 + + +@app.post("/execute", response_model=ExecuteResponse) +async def execute(request: ExecuteRequest): + start = time.perf_counter() + stdout_buf = io.StringIO() + stderr_buf = io.StringIO() + + output_dir = request.output_dir + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + artifacts_before = set(artifact_store.list_all().keys()) + + try: + flowfile_client._set_context( + node_id=request.node_id, + input_paths=request.input_paths, + output_dir=output_dir, + artifact_store=artifact_store, + ) + + with contextlib.redirect_stdout(stdout_buf), contextlib.redirect_stderr(stderr_buf): + exec(request.code, {"flowfile": flowfile_client}) # noqa: S102 + + # Collect output parquet files + output_paths: list[str] = [] + if output_dir and Path(output_dir).exists(): + output_paths = [ + str(p) for p in sorted(Path(output_dir).glob("*.parquet")) + ] + + artifacts_after = set(artifact_store.list_all().keys()) + new_artifacts = sorted(artifacts_after - artifacts_before) + + elapsed = (time.perf_counter() - start) * 1000 + return ExecuteResponse( + success=True, + output_paths=output_paths, + artifacts_published=new_artifacts, + stdout=stdout_buf.getvalue(), + stderr=stderr_buf.getvalue(), + execution_time_ms=elapsed, + ) + except Exception as exc: + elapsed = (time.perf_counter() - start) * 1000 + return ExecuteResponse( + success=False, + stdout=stdout_buf.getvalue(), + stderr=stderr_buf.getvalue(), + error=f"{type(exc).__name__}: {exc}", + execution_time_ms=elapsed, + ) + finally: + flowfile_client._clear_context() + + +@app.post("/clear") +async def clear_artifacts(): + artifact_store.clear() 
+ return {"status": "cleared"} + + +@app.get("/artifacts") +async def list_artifacts(): + return artifact_store.list_all() + + +@app.get("/health") +async def health(): + return {"status": "healthy", "artifact_count": len(artifact_store.list_all())} diff --git a/kernel_runtime/pyproject.toml b/kernel_runtime/pyproject.toml new file mode 100644 index 000000000..4c87b7cd6 --- /dev/null +++ b/kernel_runtime/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "kernel_runtime" +version = "0.1.0" +description = "FlowFile kernel runtime - executes Python code in isolated Docker containers" +requires-python = ">=3.10" +dependencies = [ + "fastapi>=0.115.0", + "uvicorn>=0.32.0", + "polars>=1.0.0", + "pyarrow>=14.0.0", + "httpx>=0.24.0", +] + +[project.optional-dependencies] +test = [ + "pytest>=7.0.0", + "httpx>=0.24.0", +] + +[build-system] +requires = ["setuptools>=68.0"] +build-backend = "setuptools.build_meta" diff --git a/kernel_runtime/tests/__init__.py b/kernel_runtime/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kernel_runtime/tests/conftest.py b/kernel_runtime/tests/conftest.py new file mode 100644 index 000000000..2218fa0dc --- /dev/null +++ b/kernel_runtime/tests/conftest.py @@ -0,0 +1,40 @@ +"""Shared fixtures for kernel_runtime tests.""" + +import os +import tempfile +from collections.abc import Generator +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient + +from kernel_runtime.artifact_store import ArtifactStore +from kernel_runtime.main import app, artifact_store + + +@pytest.fixture() +def store() -> ArtifactStore: + """Fresh ArtifactStore for each test.""" + return ArtifactStore() + + +@pytest.fixture(autouse=True) +def _clear_global_artifacts(): + """Reset the global artifact_store between tests.""" + artifact_store.clear() + yield + artifact_store.clear() + + +@pytest.fixture() +def client() -> Generator[TestClient, None, None]: + """FastAPI TestClient for the kernel runtime app.""" + with TestClient(app) as c: + yield c + + +@pytest.fixture() +def tmp_dir() -> Generator[Path, None, None]: + """Temporary directory cleaned up after each test.""" + with tempfile.TemporaryDirectory(prefix="kernel_test_") as d: + yield Path(d) diff --git a/kernel_runtime/tests/test_artifact_store.py b/kernel_runtime/tests/test_artifact_store.py new file mode 100644 index 000000000..61e1d7c13 --- /dev/null +++ b/kernel_runtime/tests/test_artifact_store.py @@ -0,0 +1,101 @@ +"""Tests for kernel_runtime.artifact_store.""" + +import threading + +import pytest + +from kernel_runtime.artifact_store import ArtifactStore + + +class TestPublishAndGet: + def test_publish_and_retrieve(self, store: ArtifactStore): + store.publish("my_obj", {"a": 1}, node_id=1) + assert store.get("my_obj") == {"a": 1} + + def test_publish_overwrites(self, store: ArtifactStore): + store.publish("key", "first", node_id=1) + store.publish("key", "second", node_id=2) + assert store.get("key") == "second" + + def test_get_missing_raises(self, store: ArtifactStore): + with pytest.raises(KeyError, match="not found"): + store.get("nonexistent") + + def test_publish_various_types(self, store: ArtifactStore): + store.publish("int_val", 42, node_id=1) + store.publish("list_val", [1, 2, 3], node_id=1) + store.publish("none_val", None, node_id=1) + assert store.get("int_val") == 42 + assert store.get("list_val") == [1, 2, 3] + assert store.get("none_val") is None + + +class TestListAll: + def test_empty_store(self, store: ArtifactStore): + assert store.list_all() == {} + + 
def test_list_excludes_object(self, store: ArtifactStore): + store.publish("item", {"secret": "data"}, node_id=5) + listing = store.list_all() + assert "item" in listing + assert "object" not in listing["item"] + + def test_list_metadata_fields(self, store: ArtifactStore): + store.publish("item", [1, 2], node_id=3) + meta = store.list_all()["item"] + assert meta["name"] == "item" + assert meta["type_name"] == "list" + assert meta["module"] == "builtins" + assert meta["node_id"] == 3 + assert "created_at" in meta + assert "size_bytes" in meta + + def test_list_multiple_items(self, store: ArtifactStore): + store.publish("a", 1, node_id=1) + store.publish("b", 2, node_id=2) + listing = store.list_all() + assert set(listing.keys()) == {"a", "b"} + + +class TestClear: + def test_clear_empties_store(self, store: ArtifactStore): + store.publish("x", 1, node_id=1) + store.publish("y", 2, node_id=1) + store.clear() + assert store.list_all() == {} + + def test_clear_then_get_raises(self, store: ArtifactStore): + store.publish("x", 1, node_id=1) + store.clear() + with pytest.raises(KeyError): + store.get("x") + + def test_clear_idempotent(self, store: ArtifactStore): + store.clear() + store.clear() + assert store.list_all() == {} + + +class TestThreadSafety: + def test_concurrent_publishes(self, store: ArtifactStore): + errors = [] + + def publish_range(start: int, count: int): + try: + for i in range(start, start + count): + store.publish(f"item_{i}", i, node_id=i) + except Exception as exc: + errors.append(exc) + + threads = [ + threading.Thread(target=publish_range, args=(i * 100, 100)) + for i in range(4) + ] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors + listing = store.list_all() + assert len(listing) == 400 diff --git a/kernel_runtime/tests/test_flowfile_client.py b/kernel_runtime/tests/test_flowfile_client.py new file mode 100644 index 000000000..5d745de8a --- /dev/null +++ b/kernel_runtime/tests/test_flowfile_client.py @@ -0,0 +1,168 @@ +"""Tests for kernel_runtime.flowfile_client.""" + +from pathlib import Path + +import polars as pl +import pytest + +from kernel_runtime.artifact_store import ArtifactStore +from kernel_runtime import flowfile_client + + +@pytest.fixture(autouse=True) +def _reset_context(): + """Ensure context is cleared before and after each test.""" + flowfile_client._clear_context() + yield + flowfile_client._clear_context() + + +@pytest.fixture() +def ctx(tmp_dir: Path) -> dict: + """Set up a standard context and return its parameters.""" + store = ArtifactStore() + input_dir = tmp_dir / "inputs" + output_dir = tmp_dir / "outputs" + input_dir.mkdir() + output_dir.mkdir() + + # Write a default input parquet + df = pl.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]}) + main_path = input_dir / "main.parquet" + df.write_parquet(str(main_path)) + + flowfile_client._set_context( + node_id=1, + input_paths={"main": str(main_path)}, + output_dir=str(output_dir), + artifact_store=store, + ) + return { + "store": store, + "input_dir": input_dir, + "output_dir": output_dir, + "main_path": main_path, + } + + +class TestContextManagement: + def test_missing_context_raises(self): + with pytest.raises(RuntimeError, match="context not initialized"): + flowfile_client.read_input() + + def test_set_and_clear(self, tmp_dir: Path): + store = ArtifactStore() + flowfile_client._set_context( + node_id=1, + input_paths={}, + output_dir=str(tmp_dir), + artifact_store=store, + ) + # Should not raise + flowfile_client._get_context_value("node_id") + + 
flowfile_client._clear_context() + with pytest.raises(RuntimeError): + flowfile_client._get_context_value("node_id") + + +class TestReadInput: + def test_read_main_input(self, ctx: dict): + lf = flowfile_client.read_input() + assert isinstance(lf, pl.LazyFrame) + df = lf.collect() + assert set(df.columns) == {"x", "y"} + assert len(df) == 3 + + def test_read_named_input(self, ctx: dict): + lf = flowfile_client.read_input("main") + df = lf.collect() + assert df["x"].to_list() == [1, 2, 3] + + def test_read_missing_input_raises(self, ctx: dict): + with pytest.raises(KeyError, match="not found"): + flowfile_client.read_input("nonexistent") + + def test_read_inputs_returns_dict(self, ctx: dict): + inputs = flowfile_client.read_inputs() + assert isinstance(inputs, dict) + assert "main" in inputs + assert isinstance(inputs["main"], pl.LazyFrame) + + +class TestReadMultipleInputs: + def test_multiple_named_inputs(self, tmp_dir: Path): + store = ArtifactStore() + input_dir = tmp_dir / "inputs" + input_dir.mkdir(exist_ok=True) + + left_path = input_dir / "left.parquet" + right_path = input_dir / "right.parquet" + pl.DataFrame({"id": [1, 2]}).write_parquet(str(left_path)) + pl.DataFrame({"id": [3, 4]}).write_parquet(str(right_path)) + + flowfile_client._set_context( + node_id=2, + input_paths={"left": str(left_path), "right": str(right_path)}, + output_dir=str(tmp_dir / "outputs"), + artifact_store=store, + ) + + inputs = flowfile_client.read_inputs() + assert set(inputs.keys()) == {"left", "right"} + assert inputs["left"].collect()["id"].to_list() == [1, 2] + assert inputs["right"].collect()["id"].to_list() == [3, 4] + + +class TestPublishOutput: + def test_publish_dataframe(self, ctx: dict): + df = pl.DataFrame({"a": [1, 2]}) + flowfile_client.publish_output(df) + out = Path(ctx["output_dir"]) / "main.parquet" + assert out.exists() + result = pl.read_parquet(str(out)) + assert result["a"].to_list() == [1, 2] + + def test_publish_lazyframe(self, ctx: dict): + lf = pl.LazyFrame({"b": [10, 20]}) + flowfile_client.publish_output(lf) + out = Path(ctx["output_dir"]) / "main.parquet" + assert out.exists() + result = pl.read_parquet(str(out)) + assert result["b"].to_list() == [10, 20] + + def test_publish_named_output(self, ctx: dict): + df = pl.DataFrame({"c": [5]}) + flowfile_client.publish_output(df, name="custom") + out = Path(ctx["output_dir"]) / "custom.parquet" + assert out.exists() + + def test_publish_creates_output_dir(self, tmp_dir: Path): + store = ArtifactStore() + new_output = tmp_dir / "new" / "nested" + flowfile_client._set_context( + node_id=1, + input_paths={}, + output_dir=str(new_output), + artifact_store=store, + ) + df = pl.DataFrame({"v": [1]}) + flowfile_client.publish_output(df) + assert (new_output / "main.parquet").exists() + + +class TestArtifacts: + def test_publish_and_read_artifact(self, ctx: dict): + flowfile_client.publish_artifact("my_dict", {"key": "value"}) + result = flowfile_client.read_artifact("my_dict") + assert result == {"key": "value"} + + def test_list_artifacts(self, ctx: dict): + flowfile_client.publish_artifact("a", 1) + flowfile_client.publish_artifact("b", [2, 3]) + listing = flowfile_client.list_artifacts() + assert set(listing.keys()) == {"a", "b"} + + def test_read_missing_artifact_raises(self, ctx: dict): + with pytest.raises(KeyError, match="not found"): + flowfile_client.read_artifact("missing") diff --git a/kernel_runtime/tests/test_main.py b/kernel_runtime/tests/test_main.py new file mode 100644 index 000000000..6e56ce99b --- /dev/null +++ 
b/kernel_runtime/tests/test_main.py @@ -0,0 +1,331 @@ +"""Tests for kernel_runtime.main (FastAPI endpoints).""" + +import os +from pathlib import Path + +import polars as pl +import pytest +from fastapi.testclient import TestClient + + +class TestHealthEndpoint: + def test_health_returns_200(self, client: TestClient): + resp = client.get("/health") + assert resp.status_code == 200 + data = resp.json() + assert data["status"] == "healthy" + assert data["artifact_count"] == 0 + + +class TestExecuteEndpoint: + def test_simple_print(self, client: TestClient): + resp = client.post( + "/execute", + json={ + "node_id": 1, + "code": 'print("hello")', + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["success"] is True + assert "hello" in data["stdout"] + assert data["error"] is None + + def test_syntax_error(self, client: TestClient): + resp = client.post( + "/execute", + json={ + "node_id": 2, + "code": "def broken(", + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is False + assert data["error"] is not None + assert "SyntaxError" in data["error"] + + def test_runtime_error(self, client: TestClient): + resp = client.post( + "/execute", + json={ + "node_id": 3, + "code": "1 / 0", + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is False + assert "ZeroDivisionError" in data["error"] + + def test_stderr_captured(self, client: TestClient): + resp = client.post( + "/execute", + json={ + "node_id": 4, + "code": 'import sys; sys.stderr.write("warning\\n")', + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is True + assert "warning" in data["stderr"] + + def test_execution_time_tracked(self, client: TestClient): + resp = client.post( + "/execute", + json={ + "node_id": 5, + "code": "x = sum(range(1000))", + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is True + assert data["execution_time_ms"] > 0 + + def test_flowfile_module_available(self, client: TestClient): + resp = client.post( + "/execute", + json={ + "node_id": 6, + "code": "print(type(flowfile).__name__)", + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is True + assert "module" in data["stdout"] + + +class TestExecuteWithParquet: + def test_read_and_write_parquet(self, client: TestClient, tmp_dir: Path): + input_dir = tmp_dir / "inputs" + output_dir = tmp_dir / "outputs" + input_dir.mkdir() + output_dir.mkdir() + + df_in = pl.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]}) + input_path = input_dir / "main.parquet" + df_in.write_parquet(str(input_path)) + + code = ( + "import polars as pl\n" + "df = flowfile.read_input()\n" + "df = df.collect().with_columns((pl.col('x') * pl.col('y')).alias('product'))\n" + "flowfile.publish_output(df)\n" + ) + + resp = client.post( + "/execute", + json={ + "node_id": 10, + "code": code, + "input_paths": {"main": str(input_path)}, + "output_dir": str(output_dir), + }, + ) + data = resp.json() + assert data["success"] is True, f"Execution failed: {data['error']}" + assert len(data["output_paths"]) > 0 + + out_path = output_dir / "main.parquet" + assert out_path.exists() + df_out = pl.read_parquet(str(out_path)) + assert "product" in df_out.columns + assert df_out["product"].to_list() == [10, 40, 90] + + def test_multiple_inputs(self, client: TestClient, tmp_dir: Path): + input_dir = tmp_dir / "inputs" + 
output_dir = tmp_dir / "outputs" + input_dir.mkdir() + output_dir.mkdir() + + pl.DataFrame({"id": [1, 2], "name": ["a", "b"]}).write_parquet( + str(input_dir / "left.parquet") + ) + pl.DataFrame({"id": [1, 2], "score": [90, 80]}).write_parquet( + str(input_dir / "right.parquet") + ) + + code = ( + "inputs = flowfile.read_inputs()\n" + "left = inputs['left'].collect()\n" + "right = inputs['right'].collect()\n" + "merged = left.join(right, on='id')\n" + "flowfile.publish_output(merged)\n" + ) + + resp = client.post( + "/execute", + json={ + "node_id": 11, + "code": code, + "input_paths": { + "left": str(input_dir / "left.parquet"), + "right": str(input_dir / "right.parquet"), + }, + "output_dir": str(output_dir), + }, + ) + data = resp.json() + assert data["success"] is True, f"Execution failed: {data['error']}" + + df_out = pl.read_parquet(str(output_dir / "main.parquet")) + assert set(df_out.columns) == {"id", "name", "score"} + assert len(df_out) == 2 + + def test_publish_lazyframe_output(self, client: TestClient, tmp_dir: Path): + input_dir = tmp_dir / "inputs" + output_dir = tmp_dir / "outputs" + input_dir.mkdir() + output_dir.mkdir() + + pl.DataFrame({"v": [10, 20]}).write_parquet(str(input_dir / "main.parquet")) + + code = ( + "lf = flowfile.read_input()\n" + "flowfile.publish_output(lf)\n" + ) + + resp = client.post( + "/execute", + json={ + "node_id": 12, + "code": code, + "input_paths": {"main": str(input_dir / "main.parquet")}, + "output_dir": str(output_dir), + }, + ) + data = resp.json() + assert data["success"] is True + df_out = pl.read_parquet(str(output_dir / "main.parquet")) + assert df_out["v"].to_list() == [10, 20] + + +class TestArtifactEndpoints: + def test_publish_artifact_via_execute(self, client: TestClient): + resp = client.post( + "/execute", + json={ + "node_id": 20, + "code": 'flowfile.publish_artifact("my_dict", {"a": 1})', + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is True + assert "my_dict" in data["artifacts_published"] + + def test_list_artifacts(self, client: TestClient): + # Publish via execute + client.post( + "/execute", + json={ + "node_id": 21, + "code": ( + 'flowfile.publish_artifact("item_a", [1, 2])\n' + 'flowfile.publish_artifact("item_b", "hello")\n' + ), + "input_paths": {}, + "output_dir": "", + }, + ) + + resp = client.get("/artifacts") + assert resp.status_code == 200 + data = resp.json() + assert "item_a" in data + assert "item_b" in data + # The object itself should not be in the listing + assert "object" not in data["item_a"] + + def test_clear_artifacts(self, client: TestClient): + client.post( + "/execute", + json={ + "node_id": 22, + "code": 'flowfile.publish_artifact("tmp", 42)', + "input_paths": {}, + "output_dir": "", + }, + ) + + resp = client.post("/clear") + assert resp.status_code == 200 + assert resp.json()["status"] == "cleared" + + resp = client.get("/artifacts") + assert resp.json() == {} + + def test_health_shows_artifact_count(self, client: TestClient): + client.post( + "/execute", + json={ + "node_id": 23, + "code": 'flowfile.publish_artifact("x", 1)', + "input_paths": {}, + "output_dir": "", + }, + ) + resp = client.get("/health") + assert resp.json()["artifact_count"] == 1 + + +class TestContextCleanup: + def test_context_cleared_after_success(self, client: TestClient): + """After a successful /execute, the flowfile context should be cleared.""" + client.post( + "/execute", + json={ + "node_id": 30, + "code": "x = 1", + "input_paths": {}, + "output_dir": "", + }, + ) + 
# A second call that tries to use context should still work + # (context is re-set for each request) + resp = client.post( + "/execute", + json={ + "node_id": 31, + "code": 'print("ok")', + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp.json()["success"] is True + + def test_context_cleared_after_error(self, client: TestClient): + """After a failed /execute, the flowfile context should still be cleared.""" + client.post( + "/execute", + json={ + "node_id": 32, + "code": "raise ValueError('boom')", + "input_paths": {}, + "output_dir": "", + }, + ) + resp = client.post( + "/execute", + json={ + "node_id": 33, + "code": 'print("still works")', + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is True + assert "still works" in data["stdout"] diff --git a/poetry.lock b/poetry.lock index 311677d12..c328c90e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiobotocore" @@ -2849,10 +2849,8 @@ files = [ {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c47676e5b485393f069b4d7a811267d3168ce46f988fa602658b8bb901e9e64d"}, {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:a28d8c01a7b27a1e3265b11250ba7557e5f72b5ee9e5f3a2fa8d2949c29bf5d2"}, {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5f3f2732cf504a1aa9e9609d02f79bea1067d99edf844ab92c247bbca143303b"}, - {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:865f9945ed1b3950d968ec4690ce68c55019d79e4497366d36e090327ce7db14"}, {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91537a8df2bde69b1c1db01d6d944c831ca793952e4f57892600e96cee95f2cd"}, {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4dca1f356a67ecb68c81a7bc7809f1569ad9e152ce7fd02c2f2036862ca9f66b"}, - {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0da4de5c1ac69d94ed4364b6cbe7190c1a70d325f112ba783d83f8440285f152"}, {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37d8412565a7267f7d79e29ab66876e55cb5e8e7b3bbf94f8206f6795f8f7e7e"}, {file = "psycopg2_binary-2.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:c665f01ec8ab273a61c62beeb8cce3014c214429ced8a308ca1fc410ecac3a39"}, {file = "psycopg2_binary-2.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e8480afd62362d0a6a27dd09e4ca2def6fa50ed3a4e7c09165266106b2ffa10"}, @@ -2860,10 +2858,8 @@ files = [ {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2e164359396576a3cc701ba8af4751ae68a07235d7a380c631184a611220d9a4"}, {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:d57c9c387660b8893093459738b6abddbb30a7eab058b77b0d0d1c7d521ddfd7"}, {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2c226ef95eb2250974bf6fa7a842082b31f68385c4f3268370e3f3870e7859ee"}, - {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:a311f1edc9967723d3511ea7d2708e2c3592e3405677bf53d5c7246753591fbb"}, {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ebb415404821b6d1c47353ebe9c8645967a5235e6d88f914147e7fd411419e6f"}, {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f07c9c4a5093258a03b28fab9b4f151aa376989e7f35f855088234e656ee6a94"}, - {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:00ce1830d971f43b667abe4a56e42c1e2d594b32da4802e44a73bacacb25535f"}, {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cffe9d7697ae7456649617e8bb8d7a45afb71cd13f7ab22af3e5c61f04840908"}, {file = "psycopg2_binary-2.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:304fd7b7f97eef30e91b8f7e720b3db75fee010b520e434ea35ed1ff22501d03"}, {file = "psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:be9b840ac0525a283a96b556616f5b4820e0526addb8dcf6525a0fa162730be4"}, @@ -2871,10 +2867,8 @@ files = [ {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab8905b5dcb05bf3fb22e0cf90e10f469563486ffb6a96569e51f897c750a76a"}, {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bf940cd7e7fec19181fdbc29d76911741153d51cab52e5c21165f3262125685e"}, {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa0f693d3c68ae925966f0b14b8edda71696608039f4ed61b1fe9ffa468d16db"}, - {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a1cf393f1cdaf6a9b57c0a719a1068ba1069f022a59b8b1fe44b006745b59757"}, {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7a6beb4beaa62f88592ccc65df20328029d721db309cb3250b0aae0fa146c3"}, {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:31b32c457a6025e74d233957cc9736742ac5a6cb196c6b68499f6bb51390bd6a"}, - {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:edcb3aeb11cb4bf13a2af3c53a15b3d612edeb6409047ea0b5d6a21a9d744b34"}, {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b6d93d7c0b61a1dd6197d208ab613eb7dcfdcca0a49c42ceb082257991de9d"}, {file = "psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:b33fabeb1fde21180479b2d4667e994de7bbf0eec22832ba5d9b5e4cf65b6c6d"}, {file = "psycopg2_binary-2.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8fb3db325435d34235b044b199e56cdf9ff41223a4b9752e8576465170bb38c"}, @@ -2882,10 +2876,8 @@ files = [ {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c55b385daa2f92cb64b12ec4536c66954ac53654c7f15a203578da4e78105c0"}, {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c0377174bf1dd416993d16edc15357f6eb17ac998244cca19bc67cdc0e2e5766"}, {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6ff3335ce08c75afaed19e08699e8aacf95d4a260b495a4a8545244fe2ceb3"}, - {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84011ba3109e06ac412f95399b704d3d6950e386b7994475b231cf61eec2fc1f"}, {file = "psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:ba34475ceb08cccbdd98f6b46916917ae6eeb92b5ae111df10b544c3a4621dc4"}, {file = "psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b31e90fdd0f968c2de3b26ab014314fe814225b6c324f770952f7d38abf17e3c"}, - {file = "psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d526864e0f67f74937a8fce859bd56c979f5e2ec57ca7c627f5f1071ef7fee60"}, {file = "psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04195548662fa544626c8ea0f06561eb6203f1984ba5b4562764fbeb4c3d14b1"}, {file = "psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa"}, {file = "psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:92e3b669236327083a2e33ccfa0d320dd01b9803b3e14dd986a4fc54aa00f4e1"}, @@ -2893,10 +2885,8 @@ files = [ {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b52a3f9bb540a3e4ec0f6ba6d31339727b2950c9772850d6545b7eae0b9d7c5"}, {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:db4fd476874ccfdbb630a54426964959e58da4c61c9feba73e6094d51303d7d8"}, {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47f212c1d3be608a12937cc131bd85502954398aaa1320cb4c14421a0ffccf4c"}, - {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e35b7abae2b0adab776add56111df1735ccc71406e56203515e228a8dc07089f"}, {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fcf21be3ce5f5659daefd2b3b3b6e4727b028221ddc94e6c1523425579664747"}, {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9bd81e64e8de111237737b29d68039b9c813bdf520156af36d26819c9a979e5f"}, - {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:32770a4d666fbdafab017086655bcddab791d7cb260a16679cc5a7338b64343b"}, {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3cb3a676873d7506825221045bd70e0427c905b9c8ee8d6acd70cfcbd6e576d"}, {file = "psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl", hash = "sha256:4012c9c954dfaccd28f94e84ab9f94e12df76b4afb22331b1f0d3154893a6316"}, {file = "psycopg2_binary-2.9.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20e7fb94e20b03dcc783f76c0865f9da39559dcc0c28dd1a3fce0d01902a6b9c"}, @@ -2904,10 +2894,8 @@ files = [ {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9d3a9edcfbe77a3ed4bc72836d466dfce4174beb79eda79ea155cc77237ed9e8"}, {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:44fc5c2b8fa871ce7f0023f619f1349a0aa03a0857f2c96fbc01c657dcbbdb49"}, {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9c55460033867b4622cda1b6872edf445809535144152e5d14941ef591980edf"}, - {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:2d11098a83cca92deaeaed3d58cfd150d49b3b06ee0d0852be466bf87596899e"}, {file = "psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:691c807d94aecfbc76a14e1408847d59ff5b5906a04a23e12a89007672b9e819"}, {file = "psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:8b81627b691f29c4c30a8f322546ad039c40c328373b11dff7490a3e1b517855"}, - {file = 
"psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:b637d6d941209e8d96a072d7977238eea128046effbf37d1d8b2c0764750017d"}, {file = "psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:41360b01c140c2a03d346cec3280cf8a71aa07d94f3b1509fa0161c366af66b4"}, {file = "psycopg2_binary-2.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:875039274f8a2361e5207857899706da840768e2a775bf8c65e82f60b197df02"}, ] @@ -3950,13 +3938,13 @@ files = [ [[package]] name = "tqdm" -version = "4.67.1" +version = "4.67.2" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, - {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, + {file = "tqdm-4.67.2-py3-none-any.whl", hash = "sha256:9a12abcbbff58b6036b2167d9d3853042b9d436fe7330f06ae047867f2f8e0a7"}, + {file = "tqdm-4.67.2.tar.gz", hash = "sha256:649aac53964b2cb8dec76a14b405a4c0d13612cb8933aae547dd144eacc99653"}, ] [package.dependencies] @@ -4405,4 +4393,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.14" -content-hash = "0368b7bb3231134e2c9d78e4d79c30da6abd199094c6e69c1a02102188509de8" +content-hash = "b9627d3d6426127ba47aea057bd8e6878ef7cd1f96d4bae0171ebe69f60b94ff" diff --git a/pyproject.toml b/pyproject.toml index fa21ec2a0..07a5dd91f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ pyiceberg = {extras = ["hadoop"], version = "^0.9.1"} boto3 = ">=1.38.40,<1.38.47" cryptography = "^45.0.5" httpx = "^0.28.1" +docker = ">=7.0.0" tqdm = "^4.67.1" s3fs = "^2025.7.0" pl-fuzzy-frame-match = ">=0.4.0" @@ -102,7 +103,8 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] markers = [ "worker: Tests for the flowfile_worker package", - "core: Tests for the flowfile_core package" + "core: Tests for the flowfile_core package", + "kernel: Integration tests requiring Docker kernel containers" ] [tool.coverage.run] From b24fa276a2a62a4b24ad246cfad9de68ffe5b74a Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Sat, 31 Jan 2026 16:49:38 +0100 Subject: [PATCH 02/38] Add Kernel Manager UI for Python execution environments (#282) * Add kernel management UI for Python execution environments Provides a visual interface for managing Docker-based kernel containers used by Python Script nodes. Users can create kernels with custom packages and resource limits, monitor status (stopped/starting/idle/executing/error), and control lifecycle (start/stop/delete) with auto-polling for live updates. https://claude.ai/code/session_01VvVDasZtEyGEa6TjE3Xjxm * Update package-lock.json version to match package.json https://claude.ai/code/session_01VvVDasZtEyGEa6TjE3Xjxm * Handle Docker unavailable gracefully with 503 and error banner The kernel routes now catch DockerException during manager init and return a 503 with a clear message instead of crashing with a 500. The frontend surfaces this as a red error banner at the top of the Kernel Manager page so users know Docker needs to be running. https://claude.ai/code/session_01VvVDasZtEyGEa6TjE3Xjxm * Add /kernels/docker-status endpoint and proactive UI feedback New GET /kernels/docker-status endpoint checks Docker daemon reachability and whether the flowfile-kernel image exists. 
The UI calls this on page load and shows targeted banners: red for Docker not running, yellow for missing kernel image, so users know exactly what to fix before creating kernels. https://claude.ai/code/session_01VvVDasZtEyGEa6TjE3Xjxm * Center kernel manager page with margin auto and padding Match the layout pattern used by other views (SecretsView, DatabaseView) with max-width 1200px, margin 0 auto, and standard spacing-5 padding. https://claude.ai/code/session_01VvVDasZtEyGEa6TjE3Xjxm --- .../flowfile_core/kernel/__init__.py | 2 + flowfile_core/flowfile_core/kernel/models.py | 6 + flowfile_core/flowfile_core/kernel/routes.py | 39 ++- flowfile_frontend/package-lock.json | 4 +- .../src/renderer/app/api/kernel.api.ts | 81 +++++++ .../layout/Sidebar/NavigationRoutes.ts | 7 + .../src/renderer/app/i18n/locales/gb.json | 1 + .../src/renderer/app/router/index.ts | 5 + .../src/renderer/app/types/index.ts | 1 + .../src/renderer/app/types/kernel.types.ts | 32 +++ .../KernelManagerView/CreateKernelForm.vue | 196 +++++++++++++++ .../views/KernelManagerView/KernelCard.vue | 226 +++++++++++++++++ .../KernelManagerView/KernelManagerView.vue | 227 ++++++++++++++++++ .../KernelManagerView/KernelStatusBadge.vue | 68 ++++++ .../KernelManagerView/useKernelManager.ts | 118 +++++++++ 15 files changed, 1009 insertions(+), 4 deletions(-) create mode 100644 flowfile_frontend/src/renderer/app/api/kernel.api.ts create mode 100644 flowfile_frontend/src/renderer/app/types/kernel.types.ts create mode 100644 flowfile_frontend/src/renderer/app/views/KernelManagerView/CreateKernelForm.vue create mode 100644 flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue create mode 100644 flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelManagerView.vue create mode 100644 flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelStatusBadge.vue create mode 100644 flowfile_frontend/src/renderer/app/views/KernelManagerView/useKernelManager.ts diff --git a/flowfile_core/flowfile_core/kernel/__init__.py b/flowfile_core/flowfile_core/kernel/__init__.py index 6a275c3f7..bb730d1bb 100644 --- a/flowfile_core/flowfile_core/kernel/__init__.py +++ b/flowfile_core/flowfile_core/kernel/__init__.py @@ -1,5 +1,6 @@ from flowfile_core.kernel.manager import KernelManager from flowfile_core.kernel.models import ( + DockerStatus, ExecuteRequest, ExecuteResult, KernelConfig, @@ -10,6 +11,7 @@ __all__ = [ "KernelManager", + "DockerStatus", "KernelConfig", "KernelInfo", "KernelState", diff --git a/flowfile_core/flowfile_core/kernel/models.py b/flowfile_core/flowfile_core/kernel/models.py index c753b1198..c110ff839 100644 --- a/flowfile_core/flowfile_core/kernel/models.py +++ b/flowfile_core/flowfile_core/kernel/models.py @@ -35,6 +35,12 @@ class KernelInfo(BaseModel): error_message: str | None = None +class DockerStatus(BaseModel): + available: bool + image_available: bool + error: str | None = None + + class ExecuteRequest(BaseModel): node_id: int code: str diff --git a/flowfile_core/flowfile_core/kernel/routes.py b/flowfile_core/flowfile_core/kernel/routes.py index 0058adc43..54ab14277 100644 --- a/flowfile_core/flowfile_core/kernel/routes.py +++ b/flowfile_core/flowfile_core/kernel/routes.py @@ -1,12 +1,23 @@ +import logging + from fastapi import APIRouter, HTTPException -from flowfile_core.kernel.models import ExecuteRequest, ExecuteResult, KernelConfig, KernelInfo +from flowfile_core.kernel.models import DockerStatus, ExecuteRequest, ExecuteResult, KernelConfig, KernelInfo + +logger = 
logging.getLogger(__name__) def _get_manager(): from flowfile_core.kernel import get_kernel_manager - return get_kernel_manager() + try: + return get_kernel_manager() + except Exception as exc: + logger.error("Kernel manager unavailable: %s", exc) + raise HTTPException( + status_code=503, + detail="Docker is not available. Please ensure Docker is installed and running.", + ) router = APIRouter(prefix="/kernels") @@ -25,6 +36,30 @@ async def create_kernel(config: KernelConfig): raise HTTPException(status_code=409, detail=str(exc)) +@router.get("/docker-status", response_model=DockerStatus) +async def docker_status(): + """Check if Docker is reachable and the kernel image is available.""" + import docker as _docker + + try: + client = _docker.from_env() + client.ping() + except Exception as exc: + return DockerStatus(available=False, image_available=False, error=str(exc)) + + from flowfile_core.kernel.manager import _KERNEL_IMAGE + + try: + client.images.get(_KERNEL_IMAGE) + image_available = True + except _docker.errors.ImageNotFound: + image_available = False + except Exception: + image_available = False + + return DockerStatus(available=True, image_available=image_available) + + @router.get("/{kernel_id}", response_model=KernelInfo) async def get_kernel(kernel_id: str): kernel = await _get_manager().get_kernel(kernel_id) diff --git a/flowfile_frontend/package-lock.json b/flowfile_frontend/package-lock.json index 74331c405..9218aac16 100644 --- a/flowfile_frontend/package-lock.json +++ b/flowfile_frontend/package-lock.json @@ -1,12 +1,12 @@ { "name": "Flowfile", - "version": "0.5.6", + "version": "0.6.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "Flowfile", - "version": "0.5.6", + "version": "0.6.2", "dependencies": { "@ag-grid-community/client-side-row-model": "^31.1.1", "@ag-grid-community/core": "^31.1.1", diff --git a/flowfile_frontend/src/renderer/app/api/kernel.api.ts b/flowfile_frontend/src/renderer/app/api/kernel.api.ts new file mode 100644 index 000000000..aed1942f5 --- /dev/null +++ b/flowfile_frontend/src/renderer/app/api/kernel.api.ts @@ -0,0 +1,81 @@ +import axios from "../services/axios.config"; +import type { DockerStatus, KernelConfig, KernelInfo } from "../types"; + +const API_BASE_URL = "/kernels"; + +export class KernelApi { + static async getAll(): Promise { + try { + const response = await axios.get(`${API_BASE_URL}/`); + return response.data; + } catch (error) { + console.error("API Error: Failed to load kernels:", error); + const errorMsg = (error as any).response?.data?.detail || "Failed to load kernels"; + throw new Error(errorMsg); + } + } + + static async get(kernelId: string): Promise { + try { + const response = await axios.get( + `${API_BASE_URL}/${encodeURIComponent(kernelId)}`, + ); + return response.data; + } catch (error) { + console.error("API Error: Failed to get kernel:", error); + throw error; + } + } + + static async create(config: KernelConfig): Promise { + try { + const response = await axios.post(`${API_BASE_URL}/`, config); + return response.data; + } catch (error) { + console.error("API Error: Failed to create kernel:", error); + const errorMsg = (error as any).response?.data?.detail || "Failed to create kernel"; + throw new Error(errorMsg); + } + } + + static async delete(kernelId: string): Promise { + try { + await axios.delete(`${API_BASE_URL}/${encodeURIComponent(kernelId)}`); + } catch (error) { + console.error("API Error: Failed to delete kernel:", error); + throw error; + } + } + + static async start(kernelId: 
string): Promise { + try { + const response = await axios.post( + `${API_BASE_URL}/${encodeURIComponent(kernelId)}/start`, + ); + return response.data; + } catch (error) { + console.error("API Error: Failed to start kernel:", error); + const errorMsg = (error as any).response?.data?.detail || "Failed to start kernel"; + throw new Error(errorMsg); + } + } + + static async stop(kernelId: string): Promise { + try { + await axios.post(`${API_BASE_URL}/${encodeURIComponent(kernelId)}/stop`); + } catch (error) { + console.error("API Error: Failed to stop kernel:", error); + throw error; + } + } + + static async getDockerStatus(): Promise { + try { + const response = await axios.get(`${API_BASE_URL}/docker-status`); + return response.data; + } catch (error) { + console.error("API Error: Failed to check Docker status:", error); + return { available: false, image_available: false, error: "Failed to reach server" }; + } + } +} diff --git a/flowfile_frontend/src/renderer/app/components/layout/Sidebar/NavigationRoutes.ts b/flowfile_frontend/src/renderer/app/components/layout/Sidebar/NavigationRoutes.ts index 7b2d4c43d..da1bc8025 100644 --- a/flowfile_frontend/src/renderer/app/components/layout/Sidebar/NavigationRoutes.ts +++ b/flowfile_frontend/src/renderer/app/components/layout/Sidebar/NavigationRoutes.ts @@ -49,6 +49,13 @@ export default { icon: "fa-solid fa-key", }, }, + { + name: "kernelManager", + displayName: "menu.kernelManager", + meta: { + icon: "fa-solid fa-server", + }, + }, { name: "nodeDesigner", displayName: "menu.nodeDesigner", diff --git a/flowfile_frontend/src/renderer/app/i18n/locales/gb.json b/flowfile_frontend/src/renderer/app/i18n/locales/gb.json index a3b2a47f3..932349538 100644 --- a/flowfile_frontend/src/renderer/app/i18n/locales/gb.json +++ b/flowfile_frontend/src/renderer/app/i18n/locales/gb.json @@ -337,6 +337,7 @@ "nodeDesigner": "Node Designer", "databaseManager": "Database Connection Manager", "cloudConnectionManager": "Cloud Connection Manager", + "kernelManager": "Kernel Manager", "admin": "User Management", "popovers": "Popovers", "chat": "Chat", diff --git a/flowfile_frontend/src/renderer/app/router/index.ts b/flowfile_frontend/src/renderer/app/router/index.ts index 2f3b7dd82..4ce63f524 100644 --- a/flowfile_frontend/src/renderer/app/router/index.ts +++ b/flowfile_frontend/src/renderer/app/router/index.ts @@ -61,6 +61,11 @@ const routes: Array = [ path: "secretManager", component: () => import("../views/SecretsView/SecretsView.vue"), }, + { + name: "kernelManager", + path: "kernelManager", + component: () => import("../views/KernelManagerView/KernelManagerView.vue"), + }, { name: "nodeDesigner", path: "nodeDesigner", diff --git a/flowfile_frontend/src/renderer/app/types/index.ts b/flowfile_frontend/src/renderer/app/types/index.ts index 2970ea657..dba853a4b 100644 --- a/flowfile_frontend/src/renderer/app/types/index.ts +++ b/flowfile_frontend/src/renderer/app/types/index.ts @@ -6,3 +6,4 @@ export * from "./canvas.types"; export * from "./editor.types"; export * from "./file.types"; export * from "./secrets.types"; +export * from "./kernel.types"; diff --git a/flowfile_frontend/src/renderer/app/types/kernel.types.ts b/flowfile_frontend/src/renderer/app/types/kernel.types.ts new file mode 100644 index 000000000..8a6d05aa7 --- /dev/null +++ b/flowfile_frontend/src/renderer/app/types/kernel.types.ts @@ -0,0 +1,32 @@ +// Kernel management related TypeScript interfaces and types + +export type KernelState = "stopped" | "starting" | "idle" | "executing" | "error"; + +export 
interface KernelConfig { + id: string; + name: string; + packages: string[]; + cpu_cores: number; + memory_gb: number; + gpu: boolean; +} + +export interface DockerStatus { + available: boolean; + image_available: boolean; + error: string | null; +} + +export interface KernelInfo { + id: string; + name: string; + state: KernelState; + container_id: string | null; + port: number | null; + packages: string[]; + memory_gb: number; + cpu_cores: number; + gpu: boolean; + created_at: string; + error_message: string | null; +} diff --git a/flowfile_frontend/src/renderer/app/views/KernelManagerView/CreateKernelForm.vue b/flowfile_frontend/src/renderer/app/views/KernelManagerView/CreateKernelForm.vue new file mode 100644 index 000000000..071c66c3c --- /dev/null +++ b/flowfile_frontend/src/renderer/app/views/KernelManagerView/CreateKernelForm.vue @@ -0,0 +1,196 @@ + + + + + diff --git a/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue new file mode 100644 index 000000000..e4ddedbf0 --- /dev/null +++ b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue @@ -0,0 +1,226 @@ + + + + + diff --git a/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelManagerView.vue b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelManagerView.vue new file mode 100644 index 000000000..258ccd312 --- /dev/null +++ b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelManagerView.vue @@ -0,0 +1,227 @@ + + + + + diff --git a/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelStatusBadge.vue b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelStatusBadge.vue new file mode 100644 index 000000000..f238c1cab --- /dev/null +++ b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelStatusBadge.vue @@ -0,0 +1,68 @@ + + + + + diff --git a/flowfile_frontend/src/renderer/app/views/KernelManagerView/useKernelManager.ts b/flowfile_frontend/src/renderer/app/views/KernelManagerView/useKernelManager.ts new file mode 100644 index 000000000..86efb5697 --- /dev/null +++ b/flowfile_frontend/src/renderer/app/views/KernelManagerView/useKernelManager.ts @@ -0,0 +1,118 @@ +import { ref, onMounted, onUnmounted } from "vue"; +import type { Ref } from "vue"; +import { KernelApi } from "../../api/kernel.api"; +import type { DockerStatus, KernelInfo, KernelConfig } from "../../types"; + +const POLL_INTERVAL_MS = 5000; + +export function useKernelManager() { + const kernels: Ref = ref([]); + const isLoading = ref(true); + const errorMessage: Ref = ref(null); + const dockerStatus: Ref = ref(null); + const actionInProgress: Ref> = ref({}); + let pollTimer: ReturnType | null = null; + + const checkDockerStatus = async () => { + dockerStatus.value = await KernelApi.getDockerStatus(); + }; + + const loadKernels = async () => { + try { + kernels.value = await KernelApi.getAll(); + errorMessage.value = null; + } catch (error: any) { + console.error("Failed to load kernels:", error); + errorMessage.value = error.message || "Failed to load kernels"; + throw error; + } finally { + isLoading.value = false; + } + }; + + const createKernel = async (config: KernelConfig): Promise => { + const kernel = await KernelApi.create(config); + await loadKernels(); + return kernel; + }; + + const startKernel = async (kernelId: string) => { + actionInProgress.value[kernelId] = true; + try { + await KernelApi.start(kernelId); + await loadKernels(); + } finally { + 
actionInProgress.value[kernelId] = false; + } + }; + + const stopKernel = async (kernelId: string) => { + actionInProgress.value[kernelId] = true; + try { + await KernelApi.stop(kernelId); + await loadKernels(); + } finally { + actionInProgress.value[kernelId] = false; + } + }; + + const deleteKernel = async (kernelId: string) => { + actionInProgress.value[kernelId] = true; + try { + await KernelApi.delete(kernelId); + await loadKernels(); + } finally { + delete actionInProgress.value[kernelId]; + } + }; + + const isActionInProgress = (kernelId: string): boolean => { + return !!actionInProgress.value[kernelId]; + }; + + const startPolling = () => { + stopPolling(); + pollTimer = setInterval(async () => { + try { + kernels.value = await KernelApi.getAll(); + } catch { + // Silently ignore poll errors to avoid spamming the user + } + }, POLL_INTERVAL_MS); + }; + + const stopPolling = () => { + if (pollTimer !== null) { + clearInterval(pollTimer); + pollTimer = null; + } + }; + + onMounted(async () => { + await checkDockerStatus(); + try { + await loadKernels(); + } catch { + // Error already captured in errorMessage + } + startPolling(); + }); + + onUnmounted(() => { + stopPolling(); + }); + + return { + kernels, + isLoading, + errorMessage, + dockerStatus, + actionInProgress, + loadKernels, + createKernel, + startKernel, + stopKernel, + deleteKernel, + isActionInProgress, + }; +} From 2f1ae51674c9414740a7b8365f81d97763d30e9c Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Sat, 31 Jan 2026 21:16:24 +0100 Subject: [PATCH 03/38] Add artifact context tracking for python_script nodes (#283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add ArtifactContext for tracking artifact metadata across FlowGraph Introduces an ArtifactContext class that tracks which Python artifacts are published and consumed by python_script nodes, enabling visibility into artifact availability based on graph topology and kernel isolation. 
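A minimal sketch of the intended tracking flow, using only the ArtifactContext API added in this patch (the node IDs and kernel id are illustrative):

from flowfile_core.flowfile.artifacts import ArtifactContext

ctx = ArtifactContext()

# Node 1 publishes a "model" artifact while running on kernel "k1".
ctx.record_published(1, "k1", [{"name": "model", "type_name": "LinearRegression"}])

# Node 2 runs on the same kernel and sees the artifact via its upstream node.
available = ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1])
assert "model" in available

# A node on a different kernel sees nothing: objects never leave kernel memory.
assert ctx.compute_available(node_id=3, kernel_id="k2", upstream_node_ids=[1]) == {}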
- Create artifacts.py with ArtifactRef, NodeArtifactState, ArtifactContext - Integrate ArtifactContext into FlowGraph.__init__ - Add _get_upstream_node_ids and _get_required_kernel_ids helpers - Clear artifact context at flow start in run_graph() - Compute available artifacts before and record published after execution - Add clear_artifacts_sync to KernelManager for non-async clearing - Add 32 unit tests for ArtifactContext (test_artifact_context.py) - Add 7 FlowGraph integration tests (test_flowfile.py) - Add 5 kernel integration tests (test_kernel_integration.py) https://claude.ai/code/session_01Pz85VwuSqBzovhjEEtweYN * Add delete_artifact support, duplicate publish prevention, and model training integration test - ArtifactStore.publish() now raises ValueError if artifact name already exists - Added ArtifactStore.delete() and flowfile_client.delete_artifact() - ExecuteResult/ExecuteResponse track artifacts_deleted alongside artifacts_published - ArtifactContext.record_deleted() removes artifacts from kernel index and published lists - flow_graph.add_python_script records deletions from execution results - Integration test: train numpy linear regression in node A, apply predictions in node B - Integration test: publish -> use & delete -> republish -> access flow - Integration test: duplicate publish without delete raises error - Unit tests for all new functionality across kernel_runtime and flowfile_core https://claude.ai/code/session_01Pz85VwuSqBzovhjEEtweYN * Support N inputs per name in kernel execution with read_first convenience method - Change input_paths from dict[str, str] to dict[str, list[str]] across ExecuteRequest models (kernel_runtime and flowfile_core) - read_input() now scans all paths for a name and concatenates them (union), supporting N upstream inputs under the same key (e.g. "main") - Add read_first() convenience method that reads only input_paths[name][0] - read_inputs() updated to handle list-based paths - add_python_script now accepts *flowfile_tables (varargs) and writes each input to main_0.parquet, main_1.parquet, etc. 
- All existing tests updated to use list-based input_paths format - New tests: multi-main union, read_first, read_inputs with N paths https://claude.ai/code/session_01Pz85VwuSqBzovhjEEtweYN * adding multiple paths * Fix O(N) deletion, deprecated asyncio, naive datetimes, broad exceptions, global context, and hardcoded timeout - ArtifactContext: add _publisher_index reverse map (kernel_id, name) → node_ids so record_deleted and clear_kernel avoid scanning all node states - Replace asyncio.get_event_loop() with asyncio.get_running_loop() in _wait_for_healthy (deprecated since Python 3.10) - Use datetime.now(timezone.utc) in artifacts.py and models.py instead of naive datetime.now() - Narrow except Exception to specific types: docker.errors.DockerException, httpx.HTTPError, OSError, TimeoutError in manager.py - Add debug logging for health poll failures instead of silent pass - Replace global _context dict with contextvars.ContextVar in flowfile_client for safe concurrent request handling - Make health timeout configurable via KernelConfig.health_timeout and KernelInfo.health_timeout (default 120s), wired through create/start_kernel https://claude.ai/code/session_01Pz85VwuSqBzovhjEEtweYN * fix binding to input_id * remove breakpoint * Preserve artifact state for cached nodes and add multi-input integration tests Snapshot artifact context before clear_all() in run_graph() and restore state for nodes that were cached/skipped (their _func never re-executed so record_published was never called). Also adds two integration tests exercising multi-input python_script nodes: one using read_input() for union and one using read_first() for single-input access. https://claude.ai/code/session_01Pz85VwuSqBzovhjEEtweYN * Allow python_script node to accept multiple main inputs Change the python_script NodeTemplate input from 1 to 10, matching polars_code and union nodes. With input=1, add_node_connection always replaced main_inputs instead of appending, so only the last connection was retained. https://claude.ai/code/session_01Pz85VwuSqBzovhjEEtweYN * adding fix * Scope artifact restore to graph nodes only The snapshot/restore logic was restoring artifact state for node IDs that were not part of the graph (e.g. manually injected via record_published). 
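As a sketch of the user-facing result, code inside a python_script node with several upstream "main" connections might look like the following; the flowfile module is injected into the exec namespace by the kernel runtime, read_first is used as described above, and the column names are illustrative:

# Runs inside the kernel container via /execute; `flowfile` is injected by the runtime.
import polars as pl

all_main = flowfile.read_input("main")   # union of every upstream "main" input
first = flowfile.read_first("main")      # only the first upstream input

# Publish an aggregated result plus the untouched first input as a named output.
summary = all_main.group_by("category").agg(pl.len().alias("rows"))  # "category" is illustrative
flowfile.publish_output(summary)
flowfile.publish_output(first, name="first_input")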
--- .../flowfile_core/configs/node_store/nodes.py | 4 +- .../flowfile_core/flowfile/artifacts.py | 308 ++++++++ .../flowfile_core/flowfile/flow_graph.py | 114 ++- flowfile_core/flowfile_core/kernel/manager.py | 34 +- flowfile_core/flowfile_core/kernel/models.py | 9 +- .../flowfile_core/schemas/input_schema.py | 2 +- .../tests/flowfile/test_artifact_context.py | 346 +++++++++ flowfile_core/tests/flowfile/test_flowfile.py | 104 +++ .../tests/flowfile/test_kernel_integration.py | 728 +++++++++++++++++- .../kernel_runtime/artifact_store.py | 13 + .../kernel_runtime/flowfile_client.py | 67 +- kernel_runtime/kernel_runtime/main.py | 5 +- kernel_runtime/tests/test_artifact_store.py | 32 +- kernel_runtime/tests/test_flowfile_client.py | 95 ++- kernel_runtime/tests/test_main.py | 180 ++++- 15 files changed, 1987 insertions(+), 54 deletions(-) create mode 100644 flowfile_core/flowfile_core/flowfile/artifacts.py create mode 100644 flowfile_core/tests/flowfile/test_artifact_context.py diff --git a/flowfile_core/flowfile_core/configs/node_store/nodes.py b/flowfile_core/flowfile_core/configs/node_store/nodes.py index 6346029db..e985e930a 100644 --- a/flowfile_core/flowfile_core/configs/node_store/nodes.py +++ b/flowfile_core/flowfile_core/configs/node_store/nodes.py @@ -289,11 +289,13 @@ def get_all_standard_nodes() -> tuple[list[NodeTemplate], dict[str, NodeTemplate NodeTemplate( name="Python Script", item="python_script", - input=1, + input=10, output=1, transform_type="narrow", image="polars_code.png", node_group="transform", + multi=True, + can_be_start=True, node_type="process", drawer_title="Python Script", drawer_intro="Execute Python code on an isolated kernel container", diff --git a/flowfile_core/flowfile_core/flowfile/artifacts.py b/flowfile_core/flowfile_core/flowfile/artifacts.py new file mode 100644 index 000000000..1381643a2 --- /dev/null +++ b/flowfile_core/flowfile_core/flowfile/artifacts.py @@ -0,0 +1,308 @@ +"""Artifact context tracking for the FlowGraph. + +This module provides metadata tracking for Python artifacts that are +published and consumed by ``python_script`` nodes running on kernel +containers. The actual objects remain in kernel memory; this module +only tracks *references* (name, source node, type info, etc.) so the +FlowGraph can reason about artifact availability across the DAG. 
+""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ArtifactRef: + """Metadata reference to an artifact (not the object itself).""" + + name: str + source_node_id: int + kernel_id: str = "" + type_name: str = "" + module: str = "" + size_bytes: int = 0 + created_at: datetime = field(default_factory=datetime.now) + + def to_dict(self) -> dict[str, Any]: + return { + "name": self.name, + "source_node_id": self.source_node_id, + "kernel_id": self.kernel_id, + "type_name": self.type_name, + "module": self.module, + "size_bytes": self.size_bytes, + "created_at": self.created_at.isoformat(), + } + + +@dataclass +class NodeArtifactState: + """Artifact state for a single node.""" + + published: list[ArtifactRef] = field(default_factory=list) + available: dict[str, ArtifactRef] = field(default_factory=dict) + consumed: list[str] = field(default_factory=list) + deleted: list[str] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return { + "published": [r.to_dict() for r in self.published], + "available": {k: v.to_dict() for k, v in self.available.items()}, + "consumed": list(self.consumed), + "deleted": list(self.deleted), + } + + +class ArtifactContext: + """Tracks artifact availability across the flow graph. + + This is a metadata-only tracker. Actual Python objects stay inside + the kernel container's ``ArtifactStore``. + """ + + def __init__(self) -> None: + self._node_states: dict[int, NodeArtifactState] = {} + self._kernel_artifacts: dict[str, dict[str, ArtifactRef]] = {} + # Reverse index: (kernel_id, artifact_name) → set of node_ids that + # published it. Avoids O(N) scan in record_deleted / clear_kernel. + self._publisher_index: dict[tuple[str, str], set[int]] = {} + + # ------------------------------------------------------------------ + # Recording + # ------------------------------------------------------------------ + + def record_published( + self, + node_id: int, + kernel_id: str, + artifacts: list[dict[str, Any] | str], + ) -> list[ArtifactRef]: + """Record artifacts published by *node_id*. + + ``artifacts`` may be a list of dicts (with at least a ``"name"`` key) + or a plain list of artifact name strings. + + Returns the created :class:`ArtifactRef` objects. 
+ """ + state = self._get_or_create_state(node_id) + refs: list[ArtifactRef] = [] + for item in artifacts: + if isinstance(item, str): + item = {"name": item} + ref = ArtifactRef( + name=item["name"], + source_node_id=node_id, + kernel_id=kernel_id, + type_name=item.get("type_name", ""), + module=item.get("module", ""), + size_bytes=item.get("size_bytes", 0), + created_at=datetime.now(timezone.utc), + ) + refs.append(ref) + state.published.append(ref) + + # Update the per-kernel index + kernel_map = self._kernel_artifacts.setdefault(kernel_id, {}) + kernel_map[ref.name] = ref + + # Update the reverse index + key = (kernel_id, ref.name) + self._publisher_index.setdefault(key, set()).add(node_id) + + logger.debug( + "Node %s published %d artifact(s) on kernel '%s': %s", + node_id, + len(refs), + kernel_id, + [r.name for r in refs], + ) + return refs + + def record_consumed(self, node_id: int, artifact_names: list[str]) -> None: + """Record that *node_id* consumed (read) the given artifact names.""" + state = self._get_or_create_state(node_id) + state.consumed.extend(artifact_names) + + def record_deleted( + self, + node_id: int, + kernel_id: str, + artifact_names: list[str], + ) -> None: + """Record that *node_id* deleted the given artifacts from *kernel_id*. + + Removes the artifacts from the kernel index and from published + lists of the publishing nodes (looked up via reverse index). + """ + state = self._get_or_create_state(node_id) + state.deleted.extend(artifact_names) + + kernel_map = self._kernel_artifacts.get(kernel_id, {}) + for name in artifact_names: + kernel_map.pop(name, None) + + # Use the reverse index to update only the affected nodes + key = (kernel_id, name) + publisher_ids = self._publisher_index.pop(key, set()) + for pid in publisher_ids: + ns = self._node_states.get(pid) + if ns is not None: + ns.published = [ + r for r in ns.published + if not (r.kernel_id == kernel_id and r.name == name) + ] + + logger.debug( + "Node %s deleted %d artifact(s) on kernel '%s': %s", + node_id, + len(artifact_names), + kernel_id, + artifact_names, + ) + + # ------------------------------------------------------------------ + # Availability computation + # ------------------------------------------------------------------ + + def compute_available( + self, + node_id: int, + kernel_id: str, + upstream_node_ids: list[int], + ) -> dict[str, ArtifactRef]: + """Compute which artifacts are available to *node_id*. + + An artifact is available if it was published by an upstream node + (direct or transitive) that used the **same** ``kernel_id``. + + The result is stored on the node's :class:`NodeArtifactState` and + also returned. 
+ """ + available: dict[str, ArtifactRef] = {} + for uid in upstream_node_ids: + upstream_state = self._node_states.get(uid) + if upstream_state is None: + continue + for ref in upstream_state.published: + if ref.kernel_id == kernel_id: + available[ref.name] = ref + + state = self._get_or_create_state(node_id) + state.available = available + + logger.debug( + "Node %s has %d available artifact(s): %s", + node_id, + len(available), + list(available.keys()), + ) + return available + + # ------------------------------------------------------------------ + # Queries + # ------------------------------------------------------------------ + + def get_published_by_node(self, node_id: int) -> list[ArtifactRef]: + """Return artifacts published by *node_id* (empty list if unknown).""" + state = self._node_states.get(node_id) + if state is None: + return [] + return list(state.published) + + def get_available_for_node(self, node_id: int) -> dict[str, ArtifactRef]: + """Return the availability map for *node_id* (empty dict if unknown).""" + state = self._node_states.get(node_id) + if state is None: + return {} + return dict(state.available) + + def get_kernel_artifacts(self, kernel_id: str) -> dict[str, ArtifactRef]: + """Return all known artifacts for a given kernel.""" + return dict(self._kernel_artifacts.get(kernel_id, {})) + + def get_all_artifacts(self) -> dict[str, ArtifactRef]: + """Return every tracked artifact across all kernels.""" + result: dict[str, ArtifactRef] = {} + for kernel_map in self._kernel_artifacts.values(): + result.update(kernel_map) + return result + + # ------------------------------------------------------------------ + # Clearing + # ------------------------------------------------------------------ + + def clear_kernel(self, kernel_id: str) -> None: + """Remove tracking for a specific kernel. + + Also removes the corresponding published refs from node states + and cleans up the reverse index. + """ + # Clean reverse index entries for this kernel + keys_to_remove = [k for k in self._publisher_index if k[0] == kernel_id] + for k in keys_to_remove: + del self._publisher_index[k] + + self._kernel_artifacts.pop(kernel_id, None) + for state in self._node_states.values(): + state.published = [r for r in state.published if r.kernel_id != kernel_id] + state.available = { + k: v for k, v in state.available.items() if v.kernel_id != kernel_id + } + + def clear_all(self) -> None: + """Remove all tracking data.""" + self._node_states.clear() + self._kernel_artifacts.clear() + self._publisher_index.clear() + + def snapshot_node_states(self) -> dict[int, NodeArtifactState]: + """Return a shallow copy of the current per-node states. + + Useful for saving state before ``clear_all()`` so cached + (skipped) nodes can have their artifact state restored afterwards. + """ + return dict(self._node_states) + + def restore_node_state(self, node_id: int, state: NodeArtifactState) -> None: + """Re-insert a previously-snapshotted node state. + + Rebuilds the kernel index and reverse index entries for every + published artifact in *state*. 
+ """ + self._node_states[node_id] = state + for ref in state.published: + kernel_map = self._kernel_artifacts.setdefault(ref.kernel_id, {}) + kernel_map[ref.name] = ref + key = (ref.kernel_id, ref.name) + self._publisher_index.setdefault(key, set()).add(node_id) + + # ------------------------------------------------------------------ + # Serialisation + # ------------------------------------------------------------------ + + def to_dict(self) -> dict[str, Any]: + """Return a JSON-serialisable summary of the context.""" + return { + "nodes": { + str(nid): state.to_dict() for nid, state in self._node_states.items() + }, + "kernels": { + kid: {name: ref.to_dict() for name, ref in refs.items()} + for kid, refs in self._kernel_artifacts.items() + }, + } + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _get_or_create_state(self, node_id: int) -> NodeArtifactState: + if node_id not in self._node_states: + self._node_states[node_id] = NodeArtifactState() + return self._node_states[node_id] diff --git a/flowfile_core/flowfile_core/flowfile/flow_graph.py b/flowfile_core/flowfile_core/flowfile/flow_graph.py index f62ecafb9..9a20db5bc 100644 --- a/flowfile_core/flowfile_core/flowfile/flow_graph.py +++ b/flowfile_core/flowfile_core/flowfile/flow_graph.py @@ -41,6 +41,7 @@ ExternalDatabaseWriter, ExternalDfFetcher, ) +from flowfile_core.flowfile.artifacts import ArtifactContext from flowfile_core.flowfile.flow_node.flow_node import FlowNode from flowfile_core.flowfile.flow_node.schema_utils import create_schema_callback_with_output_config from flowfile_core.flowfile.graph_tree.graph_tree import ( @@ -356,6 +357,7 @@ def __init__( self.cache_results = cache_results self.__name__ = name if name else "flow_" + str(id(self)) self.depends_on = {} + self.artifact_context = ArtifactContext() # Initialize history manager for undo/redo support from flowfile_core.flowfile.history_manager import HistoryManager @@ -1120,7 +1122,7 @@ def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine: def add_python_script(self, node_python_script: input_schema.NodePythonScript): """Adds a node that executes Python code on a kernel container.""" - def _func(flowfile_table: FlowDataEngine) -> FlowDataEngine: + def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine: from flowfile_core.kernel import ExecuteRequest, get_kernel_manager kernel_id = node_python_script.python_script_input.kernel_id @@ -1134,6 +1136,14 @@ def _func(flowfile_table: FlowDataEngine) -> FlowDataEngine: node_id = node_python_script.node_id flow_id = self.flow_id + # Compute available artifacts before execution + upstream_ids = self._get_upstream_node_ids(node_id) + self.artifact_context.compute_available( + node_id=node_id, + kernel_id=kernel_id, + upstream_node_ids=upstream_ids, + ) + shared_base = manager.shared_volume_path input_dir = os.path.join(shared_base, str(flow_id), str(node_id), "inputs") output_dir = os.path.join(shared_base, str(flow_id), str(node_id), "outputs") @@ -1141,11 +1151,15 @@ def _func(flowfile_table: FlowDataEngine) -> FlowDataEngine: os.makedirs(input_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True) - # Write input to parquet - input_paths: dict[str, str] = {} - input_path = os.path.join(input_dir, "main.parquet") - flowfile_table.data_frame.collect().write_parquet(input_path) - input_paths["main"] = f"/shared/{flow_id}/{node_id}/inputs/main.parquet" + # Write inputs to parquet — 
supports N inputs under "main" + input_paths: dict[str, list[str]] = {} + main_paths: list[str] = [] + for idx, ft in enumerate(flowfile_tables): + filename = f"main_{idx}.parquet" + local_path = os.path.join(input_dir, filename) + ft.data_frame.collect().write_parquet(local_path) + main_paths.append(f"/shared/{flow_id}/{node_id}/inputs/{filename}") + input_paths["main"] = main_paths # Execute on kernel (synchronous — no async boundary issues) request = ExecuteRequest( @@ -1159,20 +1173,36 @@ def _func(flowfile_table: FlowDataEngine) -> FlowDataEngine: if not result.success: raise RuntimeError(f"Kernel execution failed: {result.error}") + # Record published artifacts after successful execution + if result.artifacts_published: + self.artifact_context.record_published( + node_id=node_id, + kernel_id=kernel_id, + artifacts=[{"name": n} for n in result.artifacts_published], + ) + + # Record deleted artifacts after successful execution + if result.artifacts_deleted: + self.artifact_context.record_deleted( + node_id=node_id, + kernel_id=kernel_id, + artifact_names=result.artifacts_deleted, + ) + # Read output output_path = os.path.join(output_dir, "main.parquet") if os.path.exists(output_path): return FlowDataEngine(pl.scan_parquet(output_path)) - # No output published, pass through input - return flowfile_table + # No output published, pass through first input + return flowfile_tables[0] if flowfile_tables else FlowDataEngine(pl.LazyFrame()) self.add_node_step( node_id=node_python_script.node_id, function=_func, node_type="python_script", setting_input=node_python_script, - input_node_ids=[node_python_script.depending_on_id], + input_node_ids=node_python_script.depending_on_ids, ) def add_dependency_on_polars_lazy_frame(self, lazy_frame: pl.LazyFrame, node_id: int): @@ -2351,6 +2381,47 @@ def trigger_fetch_node(self, node_id: int) -> RunInformation | None: finally: self.flow_settings.is_running = False + # ------------------------------------------------------------------ + # Artifact helpers + # ------------------------------------------------------------------ + + def _get_upstream_node_ids(self, node_id: int) -> list[int]: + """Get all upstream node IDs (direct and transitive) for *node_id*. + + Traverses the ``all_inputs`` links recursively and returns a + deduplicated list in breadth-first order. + """ + node = self.get_node(node_id) + if node is None: + return [] + + visited: set[int] = set() + result: list[int] = [] + queue = list(node.all_inputs) + while queue: + current = queue.pop(0) + cid = current.node_id + if cid in visited: + continue + visited.add(cid) + result.append(cid) + queue.extend(current.all_inputs) + return result + + def _get_required_kernel_ids(self) -> set[str]: + """Return the set of kernel IDs used by ``python_script`` nodes.""" + kernel_ids: set[str] = set() + for node in self.nodes: + if node.node_type == "python_script" and node.setting_input is not None: + kid = getattr( + getattr(node.setting_input, "python_script_input", None), + "kernel_id", + None, + ) + if kid: + kernel_ids.add(kid) + return kernel_ids + def _execute_single_node( self, node: FlowNode, @@ -2425,6 +2496,19 @@ def run_graph(self) -> RunInformation | None: self.flow_settings.is_canceled = False self.flow_logger.clear_log_file() self.flow_logger.info("Starting to run flowfile flow...") + + # Clear artifact tracking for a fresh run. + # Snapshot first so we can restore state for cached (skipped) nodes. 
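+ # (Sequence, matching the code below: snapshot the per-node artifact states,
+ #  wipe the in-memory tracker, then for each kernel referenced by a
+ #  python_script node drop its tracked refs and make a best-effort HTTP call
+ #  to clear the kernel's own artifact store; failures there are only logged
+ #  at debug level. Cached nodes get their snapshotted state replayed after
+ #  the run, further below.)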
+ _prev_artifact_states = self.artifact_context.snapshot_node_states() + self.artifact_context.clear_all() + for kid in self._get_required_kernel_ids(): + self.artifact_context.clear_kernel(kid) + try: + from flowfile_core.kernel import get_kernel_manager + manager = get_kernel_manager() + manager.clear_artifacts_sync(kid) + except Exception: + logger.debug("Could not clear kernel artifacts for '%s'", kid) execution_plan = compute_execution_plan( nodes=self.nodes, flow_starts=self._flow_starts + self.get_implicit_starter_nodes() ) @@ -2483,6 +2567,18 @@ def run_graph(self) -> RunInformation | None: for dep in node.get_all_dependent_nodes(): skip_node_ids.add(dep.node_id) + # Restore artifact state for graph nodes that were cached (skipped). + # Their _func didn't re-execute, so record_published was never + # called — replay their state from the pre-clear snapshot. + # Only restore nodes that actually belong to this graph to avoid + # resurrecting stale entries injected outside the graph. + graph_node_ids = set(self._node_db.keys()) + for nid, prev_state in _prev_artifact_states.items(): + if (nid in graph_node_ids + and nid not in self.artifact_context._node_states + and prev_state.published): + self.artifact_context.restore_node_state(nid, prev_state) + self.latest_run_info.end_time = datetime.datetime.now() self.flow_logger.info("Flow completed!") self.end_datetime = datetime.datetime.now() diff --git a/flowfile_core/flowfile_core/kernel/manager.py b/flowfile_core/flowfile_core/kernel/manager.py index 6732c5392..a01adf47c 100644 --- a/flowfile_core/flowfile_core/kernel/manager.py +++ b/flowfile_core/flowfile_core/kernel/manager.py @@ -54,7 +54,7 @@ def _reclaim_running_containers(self) -> None: containers = self._docker.containers.list( filters={"name": "flowfile-kernel-", "status": "running"} ) - except Exception as exc: + except (docker.errors.APIError, docker.errors.DockerException) as exc: logger.warning("Could not list running containers: %s", exc) return @@ -114,6 +114,7 @@ async def create_kernel(self, config: KernelConfig) -> KernelInfo: memory_gb=config.memory_gb, cpu_cores=config.cpu_cores, gpu=config.gpu, + health_timeout=config.health_timeout, ) self._kernels[config.id] = kernel logger.info("Created kernel '%s' on port %d", config.id, port) @@ -140,10 +141,10 @@ async def start_kernel(self, kernel_id: str) -> KernelInfo: } container = self._docker.containers.run(_KERNEL_IMAGE, **run_kwargs) kernel.container_id = container.id - await self._wait_for_healthy(kernel_id, timeout=_HEALTH_TIMEOUT) + await self._wait_for_healthy(kernel_id, timeout=kernel.health_timeout) kernel.state = KernelState.IDLE logger.info("Kernel '%s' is idle (container %s)", kernel_id, container.short_id) - except Exception as exc: + except (docker.errors.DockerException, httpx.HTTPError, TimeoutError, OSError) as exc: kernel.state = KernelState.ERROR kernel.error_message = str(exc) logger.error("Failed to start kernel '%s': %s", kernel_id, exc) @@ -214,6 +215,17 @@ async def clear_artifacts(self, kernel_id: str) -> None: response = await client.post(url) response.raise_for_status() + def clear_artifacts_sync(self, kernel_id: str) -> None: + """Synchronous wrapper around clear_artifacts() for use from non-async code.""" + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state not in (KernelState.IDLE, KernelState.EXECUTING): + raise RuntimeError(f"Kernel '{kernel_id}' is not running (state: {kernel.state})") + + url = f"http://localhost:{kernel.port}/clear" + with 
httpx.Client(timeout=httpx.Timeout(30.0)) as client: + response = client.post(url) + response.raise_for_status() + # ------------------------------------------------------------------ # Queries # ------------------------------------------------------------------ @@ -244,27 +256,23 @@ def _cleanup_container(self, kernel_id: str) -> None: container.remove(force=True) except docker.errors.NotFound: pass - except Exception as exc: + except (docker.errors.APIError, docker.errors.DockerException) as exc: logger.warning("Error cleaning up container for kernel '%s': %s", kernel_id, exc) async def _wait_for_healthy(self, kernel_id: str, timeout: int = _HEALTH_TIMEOUT) -> None: kernel = self._get_kernel_or_raise(kernel_id) url = f"http://localhost:{kernel.port}/health" - deadline = asyncio.get_event_loop().time() + timeout + loop = asyncio.get_running_loop() + deadline = loop.time() + timeout - while asyncio.get_event_loop().time() < deadline: + while loop.time() < deadline: try: async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client: response = await client.get(url) if response.status_code == 200: return - except httpx.HTTPError: - # Catches all transient errors: ConnectError, ReadError, - # ConnectTimeout, RemoteProtocolError, etc. - pass - except Exception: - # Safety net for unexpected errors during startup polling - pass + except (httpx.HTTPError, OSError) as exc: + logger.debug("Health poll for kernel '%s' failed: %s", kernel_id, exc) await asyncio.sleep(_HEALTH_POLL_INTERVAL) raise TimeoutError(f"Kernel '{kernel_id}' did not become healthy within {timeout}s") diff --git a/flowfile_core/flowfile_core/kernel/models.py b/flowfile_core/flowfile_core/kernel/models.py index c110ff839..c7d71389b 100644 --- a/flowfile_core/flowfile_core/kernel/models.py +++ b/flowfile_core/flowfile_core/kernel/models.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from pydantic import BaseModel, Field @@ -19,6 +19,7 @@ class KernelConfig(BaseModel): cpu_cores: float = 2.0 memory_gb: float = 4.0 gpu: bool = False + health_timeout: int = 120 class KernelInfo(BaseModel): @@ -31,7 +32,8 @@ class KernelInfo(BaseModel): memory_gb: float = 4.0 cpu_cores: float = 2.0 gpu: bool = False - created_at: datetime = Field(default_factory=datetime.now) + health_timeout: int = 120 + created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) error_message: str | None = None @@ -44,7 +46,7 @@ class DockerStatus(BaseModel): class ExecuteRequest(BaseModel): node_id: int code: str - input_paths: dict[str, str] = Field(default_factory=dict) + input_paths: dict[str, list[str]] = Field(default_factory=dict) output_dir: str = "" @@ -52,6 +54,7 @@ class ExecuteResult(BaseModel): success: bool output_paths: list[str] = Field(default_factory=list) artifacts_published: list[str] = Field(default_factory=list) + artifacts_deleted: list[str] = Field(default_factory=list) stdout: str = "" stderr: str = "" error: str | None = None diff --git a/flowfile_core/flowfile_core/schemas/input_schema.py b/flowfile_core/flowfile_core/schemas/input_schema.py index 6ee8ba372..46fbae118 100644 --- a/flowfile_core/flowfile_core/schemas/input_schema.py +++ b/flowfile_core/flowfile_core/schemas/input_schema.py @@ -894,7 +894,7 @@ class PythonScriptInput(BaseModel): kernel_id: str | None = None -class NodePythonScript(NodeSingleInput): +class NodePythonScript(NodeMultiInput): """Node that executes Python code on a kernel container.""" python_script_input: 
PythonScriptInput = PythonScriptInput() diff --git a/flowfile_core/tests/flowfile/test_artifact_context.py b/flowfile_core/tests/flowfile/test_artifact_context.py new file mode 100644 index 000000000..5fac95bd5 --- /dev/null +++ b/flowfile_core/tests/flowfile/test_artifact_context.py @@ -0,0 +1,346 @@ +"""Unit tests for flowfile_core.flowfile.artifacts.""" + +from datetime import datetime + +import pytest + +from flowfile_core.flowfile.artifacts import ArtifactContext, ArtifactRef, NodeArtifactState + + +# --------------------------------------------------------------------------- +# ArtifactRef +# --------------------------------------------------------------------------- + + +class TestArtifactRef: + def test_create_ref(self): + ref = ArtifactRef(name="model", source_node_id=1, kernel_id="k1") + assert ref.name == "model" + assert ref.source_node_id == 1 + assert ref.kernel_id == "k1" + assert isinstance(ref.created_at, datetime) + + def test_refs_are_hashable(self): + """Frozen dataclass instances can be used in sets / as dict keys.""" + ref = ArtifactRef(name="model", source_node_id=1) + assert hash(ref) is not None + s = {ref} + assert ref in s + + def test_refs_equality(self): + ts = datetime(2025, 1, 1) + a = ArtifactRef(name="x", source_node_id=1, created_at=ts) + b = ArtifactRef(name="x", source_node_id=1, created_at=ts) + assert a == b + + def test_to_dict(self): + ref = ArtifactRef( + name="model", + source_node_id=1, + kernel_id="k1", + type_name="RandomForest", + module="sklearn.ensemble", + size_bytes=1024, + ) + d = ref.to_dict() + assert d["name"] == "model" + assert d["source_node_id"] == 1 + assert d["kernel_id"] == "k1" + assert d["type_name"] == "RandomForest" + assert d["module"] == "sklearn.ensemble" + assert d["size_bytes"] == 1024 + assert "created_at" in d + + +# --------------------------------------------------------------------------- +# NodeArtifactState +# --------------------------------------------------------------------------- + + +class TestNodeArtifactState: + def test_defaults(self): + state = NodeArtifactState() + assert state.published == [] + assert state.available == {} + assert state.consumed == [] + + def test_to_dict(self): + ref = ArtifactRef(name="m", source_node_id=1, kernel_id="k") + state = NodeArtifactState(published=[ref], available={"m": ref}, consumed=["m"]) + d = state.to_dict() + assert len(d["published"]) == 1 + assert "m" in d["available"] + assert d["consumed"] == ["m"] + + +# --------------------------------------------------------------------------- +# ArtifactContext — Recording +# --------------------------------------------------------------------------- + + +class TestArtifactContextRecording: + def test_record_published_with_dict(self): + ctx = ArtifactContext() + refs = ctx.record_published( + node_id=1, + kernel_id="k1", + artifacts=[{"name": "model", "type_name": "RF"}], + ) + assert len(refs) == 1 + assert refs[0].name == "model" + assert refs[0].type_name == "RF" + assert refs[0].source_node_id == 1 + assert refs[0].kernel_id == "k1" + + def test_record_published_with_string_list(self): + ctx = ArtifactContext() + refs = ctx.record_published(node_id=2, kernel_id="k1", artifacts=["a", "b"]) + assert len(refs) == 2 + assert refs[0].name == "a" + assert refs[1].name == "b" + + def test_record_published_multiple_nodes(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k1", ["encoder"]) + assert len(ctx.get_published_by_node(1)) == 1 + assert 
len(ctx.get_published_by_node(2)) == 1 + + def test_record_published_updates_kernel_artifacts(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ka = ctx.get_kernel_artifacts("k1") + assert "model" in ka + assert ka["model"].source_node_id == 1 + + def test_record_consumed(self): + ctx = ArtifactContext() + ctx.record_consumed(5, ["model", "scaler"]) + state = ctx._node_states[5] + assert state.consumed == ["model", "scaler"] + + +# --------------------------------------------------------------------------- +# ArtifactContext — Availability +# --------------------------------------------------------------------------- + + +class TestArtifactContextAvailability: + def test_compute_available_from_direct_upstream(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + avail = ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1]) + assert "model" in avail + assert avail["model"].source_node_id == 1 + + def test_compute_available_transitive(self): + """Node 3 should see artifacts from node 1 via node 2.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + # Node 2 doesn't publish anything + # Node 3 lists both 1 and 2 as upstream + avail = ctx.compute_available(node_id=3, kernel_id="k1", upstream_node_ids=[1, 2]) + assert "model" in avail + + def test_compute_available_different_kernels_isolated(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + avail = ctx.compute_available(node_id=2, kernel_id="k2", upstream_node_ids=[1]) + assert avail == {} + + def test_compute_available_same_kernel_visible(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + avail = ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1]) + assert "model" in avail + + def test_compute_available_stores_on_node_state(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1]) + assert "model" in ctx.get_available_for_node(2) + + def test_compute_available_no_upstream_returns_empty(self): + ctx = ArtifactContext() + avail = ctx.compute_available(node_id=1, kernel_id="k1", upstream_node_ids=[]) + assert avail == {} + + def test_compute_available_multiple_artifacts(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model", "scaler"]) + ctx.record_published(2, "k1", ["encoder"]) + avail = ctx.compute_available(node_id=3, kernel_id="k1", upstream_node_ids=[1, 2]) + assert set(avail.keys()) == {"model", "scaler", "encoder"} + + def test_compute_available_overwrites_previous(self): + """Re-computing availability replaces old data.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1]) + # Re-compute with no upstream + ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[]) + assert ctx.get_available_for_node(2) == {} + + +# --------------------------------------------------------------------------- +# ArtifactContext — Deletion tracking +# --------------------------------------------------------------------------- + + +class TestArtifactContextDeletion: + def test_record_deleted_removes_from_kernel_index(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_deleted(2, "k1", ["model"]) + assert ctx.get_kernel_artifacts("k1") == {} + + def test_record_deleted_removes_from_published_lists(self): + ctx = ArtifactContext() + 
ctx.record_published(1, "k1", ["model", "scaler"]) + ctx.record_deleted(2, "k1", ["model"]) + published = ctx.get_published_by_node(1) + names = [r.name for r in published] + assert "model" not in names + assert "scaler" in names + + def test_record_deleted_tracks_on_node_state(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_deleted(2, "k1", ["model"]) + state = ctx._node_states[2] + assert "model" in state.deleted + + def test_deleted_artifact_not_available_downstream(self): + """If node 2 deletes an artifact published by node 1, + node 3 should not see it as available.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_deleted(2, "k1", ["model"]) + avail = ctx.compute_available(node_id=3, kernel_id="k1", upstream_node_ids=[1, 2]) + assert "model" not in avail + + def test_delete_and_republish_flow(self): + """Node 1 publishes, node 2 deletes, node 3 re-publishes, + node 4 should see the new version.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_deleted(2, "k1", ["model"]) + ctx.record_published(3, "k1", ["model"]) + avail = ctx.compute_available(node_id=4, kernel_id="k1", upstream_node_ids=[1, 2, 3]) + assert "model" in avail + assert avail["model"].source_node_id == 3 + + +# --------------------------------------------------------------------------- +# ArtifactContext — Clearing +# --------------------------------------------------------------------------- + + +class TestArtifactContextClearing: + def test_clear_kernel_removes_only_that_kernel(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k2", ["encoder"]) + ctx.clear_kernel("k1") + assert ctx.get_kernel_artifacts("k1") == {} + assert "encoder" in ctx.get_kernel_artifacts("k2") + + def test_clear_kernel_removes_from_node_states(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(1, "k2", ["encoder"]) + ctx.clear_kernel("k1") + published = ctx.get_published_by_node(1) + names = [r.name for r in published] + assert "model" not in names + assert "encoder" in names + + def test_clear_kernel_removes_from_available(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1]) + ctx.clear_kernel("k1") + assert ctx.get_available_for_node(2) == {} + + def test_clear_all_removes_everything(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k2", ["encoder"]) + ctx.compute_available(node_id=3, kernel_id="k1", upstream_node_ids=[1]) + ctx.clear_all() + assert ctx.get_published_by_node(1) == [] + assert ctx.get_published_by_node(2) == [] + assert ctx.get_available_for_node(3) == {} + assert ctx.get_kernel_artifacts("k1") == {} + assert ctx.get_kernel_artifacts("k2") == {} + assert ctx.get_all_artifacts() == {} + + +# --------------------------------------------------------------------------- +# ArtifactContext — Queries +# --------------------------------------------------------------------------- + + +class TestArtifactContextQueries: + def test_get_published_by_node_returns_empty_for_unknown(self): + ctx = ArtifactContext() + assert ctx.get_published_by_node(999) == [] + + def test_get_available_for_node_returns_empty_for_unknown(self): + ctx = ArtifactContext() + assert ctx.get_available_for_node(999) == {} + + def test_get_kernel_artifacts(self): + ctx = ArtifactContext() + 
ctx.record_published(1, "k1", ["a", "b"]) + ka = ctx.get_kernel_artifacts("k1") + assert set(ka.keys()) == {"a", "b"} + + def test_get_kernel_artifacts_empty(self): + ctx = ArtifactContext() + assert ctx.get_kernel_artifacts("nonexistent") == {} + + def test_get_all_artifacts(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k2", ["encoder"]) + all_arts = ctx.get_all_artifacts() + assert set(all_arts.keys()) == {"model", "encoder"} + + def test_get_all_artifacts_empty(self): + ctx = ArtifactContext() + assert ctx.get_all_artifacts() == {} + + +# --------------------------------------------------------------------------- +# ArtifactContext — Serialisation +# --------------------------------------------------------------------------- + + +class TestArtifactContextSerialization: + def test_to_dict_structure(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", [{"name": "model", "type_name": "RF"}]) + ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1]) + d = ctx.to_dict() + assert "nodes" in d + assert "kernels" in d + assert "1" in d["nodes"] + assert "2" in d["nodes"] + assert "k1" in d["kernels"] + assert "model" in d["kernels"]["k1"] + + def test_to_dict_empty_context(self): + ctx = ArtifactContext() + d = ctx.to_dict() + assert d == {"nodes": {}, "kernels": {}} + + def test_to_dict_is_json_serialisable(self): + import json + + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + d = ctx.to_dict() + # Should not raise + serialised = json.dumps(d) + assert isinstance(serialised, str) diff --git a/flowfile_core/tests/flowfile/test_flowfile.py b/flowfile_core/tests/flowfile/test_flowfile.py index f1a386b14..f489cf6fd 100644 --- a/flowfile_core/tests/flowfile/test_flowfile.py +++ b/flowfile_core/tests/flowfile/test_flowfile.py @@ -1750,3 +1750,107 @@ def test_fetch_before_run_debug(): assert len(example_data_after_run) > 0, "There should be data after fetch operation" + +# --------------------------------------------------------------------------- +# FlowGraph — ArtifactContext integration +# --------------------------------------------------------------------------- + + +class TestFlowGraphArtifactContext: + """Tests for ArtifactContext integration on FlowGraph.""" + + def test_flowgraph_has_artifact_context(self): + """FlowGraph initializes with an ArtifactContext.""" + from flowfile_core.flowfile.artifacts import ArtifactContext + + graph = create_graph() + assert hasattr(graph, "artifact_context") + assert isinstance(graph.artifact_context, ArtifactContext) + + def test_get_upstream_node_ids_direct(self): + """Returns direct upstream dependencies.""" + data = [{"a": 1}] + graph = create_graph() + add_manual_input(graph, data, node_id=1) + # Add node 2 depending on node 1 + node_promise = input_schema.NodePromise(flow_id=1, node_id=2, node_type="sample") + graph.add_node_promise(node_promise) + graph.add_sample(input_schema.NodeSample(flow_id=1, node_id=2, depending_on_id=1)) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + upstream = graph._get_upstream_node_ids(2) + assert 1 in upstream + + def test_get_upstream_node_ids_transitive(self): + """Returns transitive upstream dependencies (1 -> 2 -> 3).""" + data = [{"a": 1}] + graph = create_graph() + add_manual_input(graph, data, node_id=1) + + # Node 2 depends on 1 + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="sample") + graph.add_node_promise(node_promise_2) + 
graph.add_sample(input_schema.NodeSample(flow_id=1, node_id=2, depending_on_id=1)) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + # Node 3 depends on 2 + node_promise_3 = input_schema.NodePromise(flow_id=1, node_id=3, node_type="sample") + graph.add_node_promise(node_promise_3) + graph.add_sample(input_schema.NodeSample(flow_id=1, node_id=3, depending_on_id=2)) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(2, 3)) + + upstream = graph._get_upstream_node_ids(3) + assert 1 in upstream + assert 2 in upstream + + def test_get_upstream_node_ids_unknown_returns_empty(self): + """Unknown node returns empty list.""" + graph = create_graph() + assert graph._get_upstream_node_ids(999) == [] + + def test_get_required_kernel_ids_no_python_nodes(self): + """Returns empty set when no python_script nodes exist.""" + data = [{"a": 1}] + graph = create_graph() + add_manual_input(graph, data, node_id=1) + assert graph._get_required_kernel_ids() == set() + + def test_get_required_kernel_ids_with_python_nodes(self): + """Returns kernel IDs from python_script nodes.""" + data = [{"a": 1}] + graph = create_graph() + add_manual_input(graph, data, node_id=1) + + node_promise = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise) + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, + node_id=2, + depending_on_id=1, + python_script_input=input_schema.PythonScriptInput( + code='print("hi")', + kernel_id="ml_kernel", + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + assert "ml_kernel" in graph._get_required_kernel_ids() + + def test_run_graph_clears_artifact_context(self): + """Artifact context is cleared at flow start.""" + data = [{"a": 1}] + graph = create_graph() + add_manual_input(graph, data, node_id=1) + + # Pre-populate artifact_context + graph.artifact_context.record_published(99, "test", [{"name": "old"}]) + assert len(graph.artifact_context.get_published_by_node(99)) == 1 + + # Run graph + graph.run_graph() + + # Context should be cleared + assert graph.artifact_context.get_published_by_node(99) == [] + diff --git a/flowfile_core/tests/flowfile/test_kernel_integration.py b/flowfile_core/tests/flowfile/test_kernel_integration.py index 37827aa48..1b6130ede 100644 --- a/flowfile_core/tests/flowfile/test_kernel_integration.py +++ b/flowfile_core/tests/flowfile/test_kernel_integration.py @@ -163,7 +163,7 @@ def test_read_and_write_parquet(self, kernel_manager: tuple[KernelManager, str]) ExecuteRequest( node_id=4, code=code, - input_paths={"main": "/shared/test_rw/inputs/main.parquet"}, + input_paths={"main": ["/shared/test_rw/inputs/main.parquet"]}, output_dir="/shared/test_rw/outputs", ), ) @@ -209,8 +209,8 @@ def test_multiple_inputs(self, kernel_manager: tuple[KernelManager, str]): node_id=5, code=code, input_paths={ - "left": "/shared/test_multi/inputs/left.parquet", - "right": "/shared/test_multi/inputs/right.parquet", + "left": ["/shared/test_multi/inputs/left.parquet"], + "right": ["/shared/test_multi/inputs/right.parquet"], }, output_dir="/shared/test_multi/outputs", ), @@ -306,7 +306,7 @@ def test_python_script_passthrough(self, kernel_manager: tuple[KernelManager, st input_schema.NodePythonScript( flow_id=1, node_id=2, - depending_on_id=1, + depending_on_ids=[1], python_script_input=input_schema.PythonScriptInput( code=code, kernel_id=kernel_id, @@ -368,7 +368,7 @@ def test_python_script_transform(self, 
kernel_manager: tuple[KernelManager, str] input_schema.NodePythonScript( flow_id=1, node_id=2, - depending_on_id=1, + depending_on_ids=[1], python_script_input=input_schema.PythonScriptInput( code=code, kernel_id=kernel_id, @@ -416,7 +416,7 @@ def test_python_script_no_kernel_raises(self): input_schema.NodePythonScript( flow_id=1, node_id=2, - depending_on_id=1, + depending_on_ids=[1], python_script_input=input_schema.PythonScriptInput( code='print("hi")', kernel_id=None, # intentionally no kernel @@ -429,3 +429,719 @@ def test_python_script_no_kernel_raises(self): run_info = graph.run_graph() # Should fail because no kernel is selected assert not run_info.success + + +# --------------------------------------------------------------------------- +# Tests — ArtifactContext integration (requires real kernel container) +# --------------------------------------------------------------------------- + + +class TestArtifactContextIntegration: + """Integration tests verifying ArtifactContext works with real kernel execution.""" + + def test_published_artifacts_recorded_in_context(self, kernel_manager: tuple[KernelManager, str]): + """After execution, published artifacts appear in artifact_context.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + data = [{"val": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + + code = """ +df = flowfile.read_input() +flowfile.publish_artifact("my_model", {"accuracy": 0.95}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + published = graph.artifact_context.get_published_by_node(2) + assert len(published) >= 1 + names = [r.name for r in published] + assert "my_model" in names + finally: + _kernel_mod._manager = _prev + + def test_available_artifacts_computed_before_execution(self, kernel_manager: tuple[KernelManager, str]): + """Downstream nodes have correct available artifacts.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + data = [{"val": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: publishes artifact + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + code_publish = """ +df = flowfile.read_input() +flowfile.publish_artifact("trained_model", {"type": "RF"}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + 
flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code_publish, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + # Node 3: reads artifact (downstream of node 2) + node_promise_3 = input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script") + graph.add_node_promise(node_promise_3) + code_consume = """ +df = flowfile.read_input() +model = flowfile.read_artifact("trained_model") +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=code_consume, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(2, 3)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + # Node 3 should have "trained_model" available + available = graph.artifact_context.get_available_for_node(3) + assert "trained_model" in available + + finally: + _kernel_mod._manager = _prev + + def test_artifacts_cleared_between_runs(self, kernel_manager: tuple[KernelManager, str]): + """Running flow twice doesn't leak artifacts from first run.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + data = [{"val": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + + code = """ +df = flowfile.read_input() +flowfile.publish_artifact("run_artifact", [1, 2, 3]) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + # First run + run_info = graph.run_graph() + _handle_run_info(run_info) + assert len(graph.artifact_context.get_published_by_node(2)) >= 1 + + # Second run — context should be cleared at start then repopulated + run_info2 = graph.run_graph() + _handle_run_info(run_info2) + + # Should still have the artifact from this run, but no leftover state + published = graph.artifact_context.get_published_by_node(2) + names = [r.name for r in published] + assert "run_artifact" in names + # Verify it's exactly one entry (not duplicated from first run) + assert names.count("run_artifact") == 1 + + finally: + _kernel_mod._manager = _prev + + def test_multiple_artifacts_from_single_node(self, kernel_manager: tuple[KernelManager, str]): + """Node publishing multiple artifacts records all of them.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + data = [{"val": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + 
raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + + code = """ +df = flowfile.read_input() +flowfile.publish_artifact("model", {"type": "classifier"}) +flowfile.publish_artifact("encoder", {"type": "label_encoder"}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + published = graph.artifact_context.get_published_by_node(2) + names = {r.name for r in published} + assert "model" in names + assert "encoder" in names + + finally: + _kernel_mod._manager = _prev + + def test_artifact_context_to_dict_after_run(self, kernel_manager: tuple[KernelManager, str]): + """to_dict() returns valid structure after flow execution.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + data = [{"val": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + + code = """ +df = flowfile.read_input() +flowfile.publish_artifact("ctx_model", {"version": 1}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + d = graph.artifact_context.to_dict() + assert "nodes" in d + assert "kernels" in d + # Should have at least node 2 in nodes + assert "2" in d["nodes"] + # Kernel should be tracked + assert kernel_id in d["kernels"] + + finally: + _kernel_mod._manager = _prev + + def test_train_model_and_apply(self, kernel_manager: tuple[KernelManager, str]): + """Train a numpy linear-regression model in node 2, apply it in node 3.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: input data with features and target + data = [ + {"x1": 1.0, "x2": 2.0, "y": 5.0}, + {"x1": 2.0, "x2": 3.0, "y": 8.0}, + {"x1": 3.0, "x2": 4.0, "y": 11.0}, + {"x1": 4.0, "x2": 5.0, "y": 14.0}, + ] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: train model (least-squares fit) and publish as artifact + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + train_code = """ +import 
numpy as np +import polars as pl + +df = flowfile.read_input().collect() +X = np.column_stack([df["x1"].to_numpy(), df["x2"].to_numpy(), np.ones(len(df))]) +y_vals = df["y"].to_numpy() +coeffs = np.linalg.lstsq(X, y_vals, rcond=None)[0] +flowfile.publish_artifact("linear_model", {"coefficients": coeffs.tolist()}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=train_code, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + # Node 3: load model and apply predictions + node_promise_3 = input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script") + graph.add_node_promise(node_promise_3) + apply_code = """ +import numpy as np +import polars as pl + +df = flowfile.read_input().collect() +model = flowfile.read_artifact("linear_model") +coeffs = np.array(model["coefficients"]) +X = np.column_stack([df["x1"].to_numpy(), df["x2"].to_numpy(), np.ones(len(df))]) +predictions = X @ coeffs +result = df.with_columns(pl.Series("predicted_y", predictions)) +flowfile.publish_output(result) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=apply_code, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(2, 3)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + # Verify model was published and tracked + published = graph.artifact_context.get_published_by_node(2) + assert any(r.name == "linear_model" for r in published) + + # Verify node 3 had the model available + available = graph.artifact_context.get_available_for_node(3) + assert "linear_model" in available + + # Verify predictions were produced + node_3 = graph.get_node(3) + result_df = node_3.get_resulting_data().data_frame.collect() + assert "predicted_y" in result_df.columns + # The predictions should be close to the actual y values + preds = result_df["predicted_y"].to_list() + actuals = result_df["y"].to_list() + for pred, actual in zip(preds, actuals): + assert abs(pred - actual) < 0.01, f"Prediction {pred} too far from {actual}" + + finally: + _kernel_mod._manager = _prev + + def test_publish_delete_republish_access(self, kernel_manager: tuple[KernelManager, str]): + """ + Flow: node_a publishes model -> node_b uses & deletes model -> + node_c publishes new model -> node_d accesses new model. 
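+ All four python_script nodes target the same kernel_id, so the explicit delete in node_b is what allows node_c to re-publish under the same name (the kernel's ArtifactStore rejects duplicate publishes), and node_d then resolves the version-2 artifact published by node_c.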
+ """ + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: input data + data = [{"val": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2 (node_a): publish artifact_model v1 + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + code_a = """ +df = flowfile.read_input() +flowfile.publish_artifact("artifact_model", {"version": 1, "weights": [0.5]}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code_a, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + # Node 3 (node_b): read artifact_model, use it, then delete it + node_promise_3 = input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script") + graph.add_node_promise(node_promise_3) + code_b = """ +df = flowfile.read_input() +model = flowfile.read_artifact("artifact_model") +assert model["version"] == 1, f"Expected v1, got {model}" +flowfile.delete_artifact("artifact_model") +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=code_b, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(2, 3)) + + # Node 4 (node_c): publish new artifact_model v2 + node_promise_4 = input_schema.NodePromise(flow_id=1, node_id=4, node_type="python_script") + graph.add_node_promise(node_promise_4) + code_c = """ +df = flowfile.read_input() +flowfile.publish_artifact("artifact_model", {"version": 2, "weights": [0.9]}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=4, depending_on_ids=[3], + python_script_input=input_schema.PythonScriptInput( + code=code_c, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(3, 4)) + + # Node 5 (node_d): read artifact_model — should get v2 + node_promise_5 = input_schema.NodePromise(flow_id=1, node_id=5, node_type="python_script") + graph.add_node_promise(node_promise_5) + code_d = """ +df = flowfile.read_input() +model = flowfile.read_artifact("artifact_model") +assert model["version"] == 2, f"Expected v2, got {model}" +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=5, depending_on_ids=[4], + python_script_input=input_schema.PythonScriptInput( + code=code_d, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(4, 5)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + # Verify artifact context tracks the flow correctly + # Node 4 re-published artifact_model + published_4 = graph.artifact_context.get_published_by_node(4) + assert any(r.name == "artifact_model" for r in published_4) + + # Node 5 should see artifact_model as available (from node 4) + available_5 = 
graph.artifact_context.get_available_for_node(5) + assert "artifact_model" in available_5 + assert available_5["artifact_model"].source_node_id == 4 + + finally: + _kernel_mod._manager = _prev + + def test_duplicate_publish_fails(self, kernel_manager: tuple[KernelManager, str]): + """Publishing an artifact with the same name without deleting first should fail.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + data = [{"val": 1}] + node_promise = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: publishes artifact + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script") + graph.add_node_promise(node_promise_2) + code_publish = """ +df = flowfile.read_input() +flowfile.publish_artifact("model", "v1") +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code_publish, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2)) + + # Node 3: tries to publish same name without deleting — should fail + node_promise_3 = input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script") + graph.add_node_promise(node_promise_3) + code_dup = """ +df = flowfile.read_input() +flowfile.publish_artifact("model", "v2") +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=code_dup, kernel_id=kernel_id, + ), + ) + ) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(2, 3)) + + run_info = graph.run_graph() + + # Node 3 should have failed + node_3_result = next( + r for r in run_info.node_step_result if r.node_id == 3 + ) + assert node_3_result.success is False + assert "already exists" in node_3_result.error + + finally: + _kernel_mod._manager = _prev + + def test_multi_input_python_script(self, kernel_manager: tuple[KernelManager, str]): + """python_script node receives data from multiple input nodes and unions them.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: first input dataset + data_a = [{"id": 1, "value": "alpha"}, {"id": 2, "value": "beta"}] + node_promise_1 = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise_1) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data_a), + ) + ) + + # Node 2: second input dataset (same schema, different rows) + data_b = [{"id": 3, "value": "gamma"}, {"id": 4, "value": "delta"}] + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="manual_input") + graph.add_node_promise(node_promise_2) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=2, + raw_data_format=input_schema.RawData.from_pylist(data_b), + ) + ) + + # Node 3: python_script that reads all 
inputs (union) and outputs the result + node_promise_3 = input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script") + graph.add_node_promise(node_promise_3) + + code = """ +import polars as pl +df = flowfile.read_input().collect() +# Should contain all 4 rows from both inputs +assert len(df) == 4, f"Expected 4 rows, got {len(df)}" +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[1, 2], + python_script_input=input_schema.PythonScriptInput( + code=code, kernel_id=kernel_id, + ), + ) + ) + + # Connect both inputs to node 3 + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 3)) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(2, 3)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + # Verify the output contains all rows from both inputs + result = graph.get_node(3).get_resulting_data() + assert result is not None + df = result.data_frame + if hasattr(df, "collect"): + df = df.collect() + assert len(df) == 4 + assert set(df.columns) >= {"id", "value"} + ids = sorted(df["id"].to_list()) + assert ids == [1, 2, 3, 4] + + finally: + _kernel_mod._manager = _prev + + def test_multi_input_read_inputs_named(self, kernel_manager: tuple[KernelManager, str]): + """python_script node uses read_inputs() to access multiple named inputs individually.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: users dataset + users = [{"user_id": 1, "name": "Alice"}, {"user_id": 2, "name": "Bob"}] + node_promise_1 = input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input") + graph.add_node_promise(node_promise_1) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(users), + ) + ) + + # Node 2: scores dataset + scores = [{"user_id": 1, "score": 95}, {"user_id": 2, "score": 87}] + node_promise_2 = input_schema.NodePromise(flow_id=1, node_id=2, node_type="manual_input") + graph.add_node_promise(node_promise_2) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=2, + raw_data_format=input_schema.RawData.from_pylist(scores), + ) + ) + + # Node 3: python_script that reads first input and passes it through + # Since all inputs go under "main", read_first gets just the first + node_promise_3 = input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script") + graph.add_node_promise(node_promise_3) + + code = """ +import polars as pl +df = flowfile.read_first().collect() +# read_first should return only the first input (2 rows, not 4) +assert len(df) == 2, f"Expected 2 rows from read_first, got {len(df)}" +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[1, 2], + python_script_input=input_schema.PythonScriptInput( + code=code, kernel_id=kernel_id, + ), + ) + ) + + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 3)) + add_connection(graph, input_schema.NodeConnection.create_from_simple_input(2, 3)) + + run_info = graph.run_graph() + _handle_run_info(run_info) + + result = graph.get_node(3).get_resulting_data() + assert result is not None + df = result.data_frame + if hasattr(df, "collect"): + df = df.collect() + # read_first returns only the first input's data + 
assert len(df) == 2 + + finally: + _kernel_mod._manager = _prev diff --git a/kernel_runtime/kernel_runtime/artifact_store.py b/kernel_runtime/kernel_runtime/artifact_store.py index 5814b9350..a18ed81db 100644 --- a/kernel_runtime/kernel_runtime/artifact_store.py +++ b/kernel_runtime/kernel_runtime/artifact_store.py @@ -13,6 +13,13 @@ def __init__(self): def publish(self, name: str, obj: Any, node_id: int) -> None: with self._lock: + if name in self._artifacts: + raise ValueError( + f"Artifact '{name}' already exists (published by node " + f"{self._artifacts[name]['node_id']}). " + f"Delete it first with flowfile.delete_artifact('{name}') " + f"before publishing a new one with the same name." + ) self._artifacts[name] = { "object": obj, "name": name, @@ -23,6 +30,12 @@ def publish(self, name: str, obj: Any, node_id: int) -> None: "size_bytes": sys.getsizeof(obj), } + def delete(self, name: str) -> None: + with self._lock: + if name not in self._artifacts: + raise KeyError(f"Artifact '{name}' not found") + del self._artifacts[name] + def get(self, name: str) -> Any: with self._lock: if name not in self._artifacts: diff --git a/kernel_runtime/kernel_runtime/flowfile_client.py b/kernel_runtime/kernel_runtime/flowfile_client.py index 4d1daacc3..bed63c2e8 100644 --- a/kernel_runtime/kernel_runtime/flowfile_client.py +++ b/kernel_runtime/kernel_runtime/flowfile_client.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextvars import os from pathlib import Path from typing import Any @@ -8,42 +9,77 @@ from kernel_runtime.artifact_store import ArtifactStore -_context: dict[str, Any] = {} +_context: contextvars.ContextVar[dict[str, Any]] = contextvars.ContextVar("flowfile_context") def _set_context( node_id: int, - input_paths: dict[str, str], + input_paths: dict[str, list[str]], output_dir: str, artifact_store: ArtifactStore, ) -> None: - _context["node_id"] = node_id - _context["input_paths"] = input_paths - _context["output_dir"] = output_dir - _context["artifact_store"] = artifact_store + _context.set({ + "node_id": node_id, + "input_paths": input_paths, + "output_dir": output_dir, + "artifact_store": artifact_store, + }) def _clear_context() -> None: - _context.clear() + _context.set({}) def _get_context_value(key: str) -> Any: - if key not in _context: + ctx = _context.get({}) + if key not in ctx: raise RuntimeError(f"flowfile context not initialized (missing '{key}'). This API is only available during /execute.") - return _context[key] + return ctx[key] def read_input(name: str = "main") -> pl.LazyFrame: - input_paths: dict[str, str] = _get_context_value("input_paths") + """Read all input files for *name* and return them as a single LazyFrame. + + When multiple paths are registered under the same name (e.g. a union + of several upstream nodes), all files are scanned and concatenated + automatically by Polars. + """ + input_paths: dict[str, list[str]] = _get_context_value("input_paths") if name not in input_paths: available = list(input_paths.keys()) raise KeyError(f"Input '{name}' not found. Available inputs: {available}") - return pl.scan_parquet(input_paths[name]) + paths = input_paths[name] + if len(paths) == 1: + return pl.scan_parquet(paths[0]) + return pl.scan_parquet(paths) + + +def read_first(name: str = "main") -> pl.LazyFrame: + """Read only the first input file for *name*. + + This is a convenience shortcut equivalent to scanning + ``input_paths[name][0]``. 
+ """ + input_paths: dict[str, list[str]] = _get_context_value("input_paths") + if name not in input_paths: + available = list(input_paths.keys()) + raise KeyError(f"Input '{name}' not found. Available inputs: {available}") + return pl.scan_parquet(input_paths[name][0]) def read_inputs() -> dict[str, pl.LazyFrame]: - input_paths: dict[str, str] = _get_context_value("input_paths") - return {name: pl.scan_parquet(path) for name, path in input_paths.items()} + """Read all named inputs, returning a dict of LazyFrames. + + Each entry concatenates all paths registered under that name. + """ + input_paths: dict[str, list[str]] = _get_context_value("input_paths") + result: dict[str, pl.LazyFrame] = {} + for name, paths in input_paths.items(): + if len(paths) == 1: + result[name] = pl.scan_parquet(paths[0]) + else: + result[name] = pl.scan_parquet(paths) + return result def publish_output(df: pl.LazyFrame | pl.DataFrame, name: str = "main") -> None: @@ -66,6 +102,11 @@ def read_artifact(name: str) -> Any: return store.get(name) +def delete_artifact(name: str) -> None: + store: ArtifactStore = _get_context_value("artifact_store") + store.delete(name) + + def list_artifacts() -> dict: store: ArtifactStore = _get_context_value("artifact_store") return store.list_all() diff --git a/kernel_runtime/kernel_runtime/main.py b/kernel_runtime/kernel_runtime/main.py index f518510d5..36b6d5ff2 100644 --- a/kernel_runtime/kernel_runtime/main.py +++ b/kernel_runtime/kernel_runtime/main.py @@ -17,7 +17,7 @@ class ExecuteRequest(BaseModel): node_id: int code: str - input_paths: dict[str, str] = {} + input_paths: dict[str, list[str]] = {} output_dir: str = "" @@ -25,6 +25,7 @@ class ExecuteResponse(BaseModel): success: bool output_paths: list[str] = [] artifacts_published: list[str] = [] + artifacts_deleted: list[str] = [] stdout: str = "" stderr: str = "" error: str | None = None @@ -63,12 +64,14 @@ async def execute(request: ExecuteRequest): artifacts_after = set(artifact_store.list_all().keys()) new_artifacts = sorted(artifacts_after - artifacts_before) + deleted_artifacts = sorted(artifacts_before - artifacts_after) elapsed = (time.perf_counter() - start) * 1000 return ExecuteResponse( success=True, output_paths=output_paths, artifacts_published=new_artifacts, + artifacts_deleted=deleted_artifacts, stdout=stdout_buf.getvalue(), stderr=stderr_buf.getvalue(), execution_time_ms=elapsed, diff --git a/kernel_runtime/tests/test_artifact_store.py b/kernel_runtime/tests/test_artifact_store.py index 61e1d7c13..6c9564525 100644 --- a/kernel_runtime/tests/test_artifact_store.py +++ b/kernel_runtime/tests/test_artifact_store.py @@ -12,8 +12,14 @@ def test_publish_and_retrieve(self, store: ArtifactStore): store.publish("my_obj", {"a": 1}, node_id=1) assert store.get("my_obj") == {"a": 1} - def test_publish_overwrites(self, store: ArtifactStore): + def test_publish_duplicate_raises(self, store: ArtifactStore): store.publish("key", "first", node_id=1) + with pytest.raises(ValueError, match="already exists"): + store.publish("key", "second", node_id=2) + + def test_publish_after_delete_succeeds(self, store: ArtifactStore): + store.publish("key", "first", node_id=1) + store.delete("key") store.publish("key", "second", node_id=2) assert store.get("key") == "second" @@ -76,6 +82,30 @@ def test_clear_idempotent(self, store: ArtifactStore): assert store.list_all() == {} +class TestDelete: + def test_delete_removes_artifact(self, store: ArtifactStore): + store.publish("model", {"w": [1, 2]}, node_id=1) + store.delete("model") + 
assert "model" not in store.list_all() + + def test_delete_missing_raises(self, store: ArtifactStore): + with pytest.raises(KeyError, match="not found"): + store.delete("nonexistent") + + def test_delete_then_get_raises(self, store: ArtifactStore): + store.publish("tmp", 42, node_id=1) + store.delete("tmp") + with pytest.raises(KeyError, match="not found"): + store.get("tmp") + + def test_delete_only_target(self, store: ArtifactStore): + store.publish("keep", 1, node_id=1) + store.publish("remove", 2, node_id=1) + store.delete("remove") + assert store.get("keep") == 1 + assert set(store.list_all().keys()) == {"keep"} + + class TestThreadSafety: def test_concurrent_publishes(self, store: ArtifactStore): errors = [] diff --git a/kernel_runtime/tests/test_flowfile_client.py b/kernel_runtime/tests/test_flowfile_client.py index 5d745de8a..431dd5c45 100644 --- a/kernel_runtime/tests/test_flowfile_client.py +++ b/kernel_runtime/tests/test_flowfile_client.py @@ -33,7 +33,7 @@ def ctx(tmp_dir: Path) -> dict: flowfile_client._set_context( node_id=1, - input_paths={"main": str(main_path)}, + input_paths={"main": [str(main_path)]}, output_dir=str(output_dir), artifact_store=store, ) @@ -103,7 +103,7 @@ def test_multiple_named_inputs(self, tmp_dir: Path): flowfile_client._set_context( node_id=2, - input_paths={"left": str(left_path), "right": str(right_path)}, + input_paths={"left": [str(left_path)], "right": [str(right_path)]}, output_dir=str(tmp_dir / "outputs"), artifact_store=store, ) @@ -113,6 +113,76 @@ def test_multiple_named_inputs(self, tmp_dir: Path): assert inputs["left"].collect()["id"].to_list() == [1, 2] assert inputs["right"].collect()["id"].to_list() == [3, 4] + def test_read_input_concatenates_multiple_main_paths(self, tmp_dir: Path): + """When 'main' has multiple paths, read_input returns a union of all.""" + store = ArtifactStore() + input_dir = tmp_dir / "inputs" + input_dir.mkdir(exist_ok=True) + + path_a = input_dir / "main_0.parquet" + path_b = input_dir / "main_1.parquet" + pl.DataFrame({"val": [1, 2]}).write_parquet(str(path_a)) + pl.DataFrame({"val": [3, 4]}).write_parquet(str(path_b)) + + flowfile_client._set_context( + node_id=3, + input_paths={"main": [str(path_a), str(path_b)]}, + output_dir=str(tmp_dir / "outputs"), + artifact_store=store, + ) + + df = flowfile_client.read_input().collect() + assert sorted(df["val"].to_list()) == [1, 2, 3, 4] + + def test_read_first_returns_only_first(self, tmp_dir: Path): + """read_first returns only the first file, not the union.""" + store = ArtifactStore() + input_dir = tmp_dir / "inputs" + input_dir.mkdir(exist_ok=True) + + path_a = input_dir / "main_0.parquet" + path_b = input_dir / "main_1.parquet" + pl.DataFrame({"val": [1, 2]}).write_parquet(str(path_a)) + pl.DataFrame({"val": [3, 4]}).write_parquet(str(path_b)) + + flowfile_client._set_context( + node_id=4, + input_paths={"main": [str(path_a), str(path_b)]}, + output_dir=str(tmp_dir / "outputs"), + artifact_store=store, + ) + + df = flowfile_client.read_first().collect() + assert df["val"].to_list() == [1, 2] + + def test_read_first_missing_name_raises(self, ctx: dict): + with pytest.raises(KeyError, match="not found"): + flowfile_client.read_first("nonexistent") + + def test_read_inputs_with_multiple_main_paths(self, tmp_dir: Path): + """read_inputs should concatenate paths per name.""" + store = ArtifactStore() + input_dir = tmp_dir / "inputs" + input_dir.mkdir(exist_ok=True) + + path_0 = input_dir / "main_0.parquet" + path_1 = input_dir / "main_1.parquet" + path_2 = input_dir 
/ "main_2.parquet" + pl.DataFrame({"x": [1]}).write_parquet(str(path_0)) + pl.DataFrame({"x": [2]}).write_parquet(str(path_1)) + pl.DataFrame({"x": [3]}).write_parquet(str(path_2)) + + flowfile_client._set_context( + node_id=5, + input_paths={"main": [str(path_0), str(path_1), str(path_2)]}, + output_dir=str(tmp_dir / "outputs"), + artifact_store=store, + ) + + inputs = flowfile_client.read_inputs() + df = inputs["main"].collect() + assert sorted(df["x"].to_list()) == [1, 2, 3] + class TestPublishOutput: def test_publish_dataframe(self, ctx: dict): @@ -166,3 +236,24 @@ def test_list_artifacts(self, ctx: dict): def test_read_missing_artifact_raises(self, ctx: dict): with pytest.raises(KeyError, match="not found"): flowfile_client.read_artifact("missing") + + def test_publish_duplicate_artifact_raises(self, ctx: dict): + flowfile_client.publish_artifact("model", {"v": 1}) + with pytest.raises(ValueError, match="already exists"): + flowfile_client.publish_artifact("model", {"v": 2}) + + def test_delete_artifact(self, ctx: dict): + flowfile_client.publish_artifact("temp", 42) + flowfile_client.delete_artifact("temp") + with pytest.raises(KeyError, match="not found"): + flowfile_client.read_artifact("temp") + + def test_delete_missing_artifact_raises(self, ctx: dict): + with pytest.raises(KeyError, match="not found"): + flowfile_client.delete_artifact("nonexistent") + + def test_delete_then_republish(self, ctx: dict): + flowfile_client.publish_artifact("model", "v1") + flowfile_client.delete_artifact("model") + flowfile_client.publish_artifact("model", "v2") + assert flowfile_client.read_artifact("model") == "v2" diff --git a/kernel_runtime/tests/test_main.py b/kernel_runtime/tests/test_main.py index 6e56ce99b..dd0bf7d8f 100644 --- a/kernel_runtime/tests/test_main.py +++ b/kernel_runtime/tests/test_main.py @@ -129,7 +129,7 @@ def test_read_and_write_parquet(self, client: TestClient, tmp_dir: Path): json={ "node_id": 10, "code": code, - "input_paths": {"main": str(input_path)}, + "input_paths": {"main": [str(input_path)]}, "output_dir": str(output_dir), }, ) @@ -170,8 +170,8 @@ def test_multiple_inputs(self, client: TestClient, tmp_dir: Path): "node_id": 11, "code": code, "input_paths": { - "left": str(input_dir / "left.parquet"), - "right": str(input_dir / "right.parquet"), + "left": [str(input_dir / "left.parquet")], + "right": [str(input_dir / "right.parquet")], }, "output_dir": str(output_dir), }, @@ -183,6 +183,76 @@ def test_multiple_inputs(self, client: TestClient, tmp_dir: Path): assert set(df_out.columns) == {"id", "name", "score"} assert len(df_out) == 2 + def test_multi_main_inputs_union(self, client: TestClient, tmp_dir: Path): + """Multiple paths under 'main' are concatenated (union) by read_input.""" + input_dir = tmp_dir / "inputs" + output_dir = tmp_dir / "outputs" + input_dir.mkdir() + output_dir.mkdir() + + pl.DataFrame({"v": [1, 2]}).write_parquet(str(input_dir / "main_0.parquet")) + pl.DataFrame({"v": [3, 4]}).write_parquet(str(input_dir / "main_1.parquet")) + + code = ( + "df = flowfile.read_input().collect()\n" + "flowfile.publish_output(df)\n" + ) + + resp = client.post( + "/execute", + json={ + "node_id": 13, + "code": code, + "input_paths": { + "main": [ + str(input_dir / "main_0.parquet"), + str(input_dir / "main_1.parquet"), + ], + }, + "output_dir": str(output_dir), + }, + ) + data = resp.json() + assert data["success"] is True, f"Execution failed: {data['error']}" + + df_out = pl.read_parquet(str(output_dir / "main.parquet")) + assert sorted(df_out["v"].to_list()) == 
[1, 2, 3, 4] + + def test_read_first_via_execute(self, client: TestClient, tmp_dir: Path): + """read_first returns only the first input file.""" + input_dir = tmp_dir / "inputs" + output_dir = tmp_dir / "outputs" + input_dir.mkdir() + output_dir.mkdir() + + pl.DataFrame({"v": [10, 20]}).write_parquet(str(input_dir / "a.parquet")) + pl.DataFrame({"v": [30, 40]}).write_parquet(str(input_dir / "b.parquet")) + + code = ( + "df = flowfile.read_first().collect()\n" + "flowfile.publish_output(df)\n" + ) + + resp = client.post( + "/execute", + json={ + "node_id": 14, + "code": code, + "input_paths": { + "main": [ + str(input_dir / "a.parquet"), + str(input_dir / "b.parquet"), + ], + }, + "output_dir": str(output_dir), + }, + ) + data = resp.json() + assert data["success"] is True, f"Execution failed: {data['error']}" + + df_out = pl.read_parquet(str(output_dir / "main.parquet")) + assert df_out["v"].to_list() == [10, 20] + def test_publish_lazyframe_output(self, client: TestClient, tmp_dir: Path): input_dir = tmp_dir / "inputs" output_dir = tmp_dir / "outputs" @@ -201,7 +271,7 @@ def test_publish_lazyframe_output(self, client: TestClient, tmp_dir: Path): json={ "node_id": 12, "code": code, - "input_paths": {"main": str(input_dir / "main.parquet")}, + "input_paths": {"main": [str(input_dir / "main.parquet")]}, "output_dir": str(output_dir), }, ) @@ -280,6 +350,108 @@ def test_health_shows_artifact_count(self, client: TestClient): resp = client.get("/health") assert resp.json()["artifact_count"] == 1 + def test_duplicate_publish_fails(self, client: TestClient): + """Publishing an artifact with the same name twice should fail.""" + resp = client.post( + "/execute", + json={ + "node_id": 24, + "code": 'flowfile.publish_artifact("model", 1)', + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp.json()["success"] is True + + resp2 = client.post( + "/execute", + json={ + "node_id": 25, + "code": 'flowfile.publish_artifact("model", 2)', + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp2.json() + assert data["success"] is False + assert "already exists" in data["error"] + + def test_delete_artifact_via_execute(self, client: TestClient): + """delete_artifact removes from the store and appears in artifacts_deleted.""" + client.post( + "/execute", + json={ + "node_id": 26, + "code": 'flowfile.publish_artifact("temp", 99)', + "input_paths": {}, + "output_dir": "", + }, + ) + resp = client.post( + "/execute", + json={ + "node_id": 27, + "code": 'flowfile.delete_artifact("temp")', + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is True + assert "temp" in data["artifacts_deleted"] + + # Verify artifact is gone + resp_list = client.get("/artifacts") + assert "temp" not in resp_list.json() + + def test_delete_then_republish_via_execute(self, client: TestClient): + """After deleting, a new artifact with the same name can be published.""" + client.post( + "/execute", + json={ + "node_id": 28, + "code": 'flowfile.publish_artifact("model", "v1")', + "input_paths": {}, + "output_dir": "", + }, + ) + resp = client.post( + "/execute", + json={ + "node_id": 29, + "code": ( + 'flowfile.delete_artifact("model")\n' + 'flowfile.publish_artifact("model", "v2")\n' + ), + "input_paths": {}, + "output_dir": "", + }, + ) + data = resp.json() + assert data["success"] is True + # The artifact was deleted and re-published in the same call. 
+        # Because "model" existed before this /execute call and still exists after it, +        # it is neither newly published nor deleted from this call's perspective, so it +        # must not appear in artifacts_deleted. +        # Verify the artifact still exists and now holds the re-published value. +        resp_read = client.post( + "/execute", + json={ + "node_id": 30, + "code": ( + 'v = flowfile.read_artifact("model")\n' + 'print(v)\n' + ), + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp_read.json()["success"] is True + assert "v2" in resp_read.json()["stdout"] + class TestContextCleanup: def test_context_cleared_after_success(self, client: TestClient): From e6657c6fafe10563692aa971b6c0956455314369 Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Sun, 1 Feb 2026 10:34:49 +0100 Subject: [PATCH 04/38] Add Python Script node with kernel and artifact support (#287) * Add PythonScript node drawer with kernel selection, code editor, and artifacts panel Implements the frontend drawer UI for the python_script node type: - Kernel selection dropdown with state indicators and warnings - CodeMirror editor with Python syntax highlighting and flowfile API autocompletions - Artifacts panel showing available/published artifacts from kernel - Help modal documenting the flowfile.* API with examples - TypeScript types for PythonScriptInput and NodePythonScript - KernelApi.getArtifacts() method for fetching kernel artifact metadata https://claude.ai/code/session_017DkcGkambwWqtNQKqMBetg * Fix published artifacts matching by using correct field name from kernel API The kernel's /artifacts endpoint returns `node_id` (not `source_node_id`) to identify which node published each artifact. Updated the frontend to read the correct field so published artifacts display properly. https://claude.ai/code/session_017DkcGkambwWqtNQKqMBetg * add translator * Split artifacts into available (other nodes) vs published (this node) Available artifacts should only show artifacts from upstream nodes, not the current node's own publications. Filter by node_id !== currentNodeId for available, and node_id === currentNodeId for published.
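For context, a minimal sketch of the kind of script this drawer is meant to edit, using only the flowfile.* helpers documented in the help modal. The artifact name and columns are illustrative, and `flowfile` itself is not imported — it is injected into the execution namespace by the kernel at /execute time:

    # Illustrative node script; "price_model" and the column names are made up.
    import polars as pl

    df = flowfile.read_input()  # LazyFrame over all parquet files registered under "main"

    # Reuse a model published by an upstream node if present, otherwise publish one.
    if "price_model" in flowfile.list_artifacts():
        model = flowfile.read_artifact("price_model")
    else:
        model = {"version": 1, "weights": [0.5]}  # stand-in for a real fitted model
        flowfile.publish_artifact("price_model", model)

    # Artifact names are unique: call flowfile.delete_artifact("price_model")
    # before publishing a replacement under the same name.

    flowfile.publish_output(df.with_columns(pl.lit(model["version"]).alias("model_version")))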
--- .../flowfile_core/schemas/schemas.py | 1 + .../src/renderer/app/api/kernel.api.ts | 12 + .../elements/pythonScript/FlowfileApiHelp.vue | 252 ++++++++ .../elements/pythonScript/PythonScript.vue | 578 ++++++++++++++++++ .../pythonScript/flowfileCompletions.ts | 81 +++ .../node-types/elements/pythonScript/utils.ts | 33 + .../src/renderer/app/types/node.types.ts | 13 + 7 files changed, 970 insertions(+) create mode 100644 flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/FlowfileApiHelp.vue create mode 100644 flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue create mode 100644 flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/flowfileCompletions.ts create mode 100644 flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/utils.ts diff --git a/flowfile_core/flowfile_core/schemas/schemas.py b/flowfile_core/flowfile_core/schemas/schemas.py index 9458e14a7..4f903f627 100644 --- a/flowfile_core/flowfile_core/schemas/schemas.py +++ b/flowfile_core/flowfile_core/schemas/schemas.py @@ -28,6 +28,7 @@ "unpivot": input_schema.NodeUnpivot, "text_to_rows": input_schema.NodeTextToRows, "graph_solver": input_schema.NodeGraphSolver, + "python_script": input_schema.NodePythonScript, "polars_code": input_schema.NodePolarsCode, "join": input_schema.NodeJoin, "cross_join": input_schema.NodeCrossJoin, diff --git a/flowfile_frontend/src/renderer/app/api/kernel.api.ts b/flowfile_frontend/src/renderer/app/api/kernel.api.ts index aed1942f5..54e136780 100644 --- a/flowfile_frontend/src/renderer/app/api/kernel.api.ts +++ b/flowfile_frontend/src/renderer/app/api/kernel.api.ts @@ -69,6 +69,18 @@ export class KernelApi { } } + static async getArtifacts(kernelId: string): Promise> { + try { + const response = await axios.get>( + `${API_BASE_URL}/${encodeURIComponent(kernelId)}/artifacts`, + ); + return response.data; + } catch (error) { + console.error("API Error: Failed to get artifacts:", error); + return {}; + } + } + static async getDockerStatus(): Promise { try { const response = await axios.get(`${API_BASE_URL}/docker-status`); diff --git a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/FlowfileApiHelp.vue b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/FlowfileApiHelp.vue new file mode 100644 index 000000000..8d428992b --- /dev/null +++ b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/FlowfileApiHelp.vue @@ -0,0 +1,252 @@ + + + + + diff --git a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue new file mode 100644 index 000000000..a0b6daa10 --- /dev/null +++ b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue @@ -0,0 +1,578 @@ + + + + + diff --git a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/flowfileCompletions.ts b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/flowfileCompletions.ts new file mode 100644 index 000000000..7b8db9da2 --- /dev/null +++ b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/flowfileCompletions.ts @@ -0,0 +1,81 @@ +export const flowfileCompletionVals = [ + // flowfile module + { + label: "flowfile", + type: 
"variable", + info: "FlowFile API module for data I/O and artifacts", + }, + + // Data I/O functions + { + label: "read_input", + type: "function", + info: "Read input DataFrame. Optional name parameter for named inputs.", + detail: "flowfile.read_input(name?)", + apply: "read_input()", + }, + { + label: "read_inputs", + type: "function", + info: "Read all inputs as a dict of DataFrames.", + detail: "flowfile.read_inputs()", + apply: "read_inputs()", + }, + { + label: "publish_output", + type: "function", + info: "Write output DataFrame. Optional name parameter for named outputs.", + detail: "flowfile.publish_output(df, name?)", + apply: "publish_output(df)", + }, + + // Artifact functions + { + label: "publish_artifact", + type: "function", + info: "Store a Python object as a named artifact in kernel memory.", + detail: 'flowfile.publish_artifact("name", obj)', + apply: 'publish_artifact("name", obj)', + }, + { + label: "read_artifact", + type: "function", + info: "Retrieve a Python object from a named artifact.", + detail: 'flowfile.read_artifact("name")', + apply: 'read_artifact("name")', + }, + { + label: "delete_artifact", + type: "function", + info: "Remove a named artifact from kernel memory.", + detail: 'flowfile.delete_artifact("name")', + apply: 'delete_artifact("name")', + }, + { + label: "list_artifacts", + type: "function", + info: "List all artifacts available in the kernel.", + detail: "flowfile.list_artifacts()", + apply: "list_artifacts()", + }, + + // Polars basics (also useful in python_script context) + { label: "pl", type: "variable", info: "Polars main module" }, + { label: "col", type: "function", info: "Polars column selector" }, + { label: "lit", type: "function", info: "Polars literal value" }, + + // Common Polars operations + { label: "select", type: "method", info: "Select columns" }, + { label: "filter", type: "method", info: "Filter rows" }, + { label: "group_by", type: "method", info: "Group by columns" }, + { label: "with_columns", type: "method", info: "Add/modify columns" }, + { label: "join", type: "method", info: "Join operations" }, + { label: "sort", type: "method", info: "Sort DataFrame" }, + { label: "collect", type: "method", info: "Collect LazyFrame to DataFrame" }, + + // Basic Python + { label: "print", type: "function" }, + { label: "len", type: "function" }, + { label: "range", type: "function" }, + { label: "import", type: "keyword" }, +]; diff --git a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/utils.ts b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/utils.ts new file mode 100644 index 000000000..046bd685c --- /dev/null +++ b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/utils.ts @@ -0,0 +1,33 @@ +import type { NodePythonScript, PythonScriptInput } from "../../../../../types/node.types"; + +export const DEFAULT_PYTHON_SCRIPT_CODE = `import polars as pl + +# Read input data +df = flowfile.read_input() + +# Your transformation here +# df = df.filter(pl.col("column") > 0) + +# Publish output +flowfile.publish_output(df) +`; + +export const createPythonScriptNode = ( + flowId: number, + nodeId: number, +): NodePythonScript => { + const pythonScriptInput: PythonScriptInput = { + code: DEFAULT_PYTHON_SCRIPT_CODE, + kernel_id: null, + }; + + return { + flow_id: flowId, + node_id: nodeId, + pos_x: 0, + pos_y: 0, + depending_on_ids: null, + python_script_input: pythonScriptInput, + cache_results: false, + }; +}; diff --git 
a/flowfile_frontend/src/renderer/app/types/node.types.ts b/flowfile_frontend/src/renderer/app/types/node.types.ts index cf6e7e30d..b92b9b598 100644 --- a/flowfile_frontend/src/renderer/app/types/node.types.ts +++ b/flowfile_frontend/src/renderer/app/types/node.types.ts @@ -520,6 +520,15 @@ export interface PolarsCodeInput { polars_code: string; } +// ============================================================================ +// Python Script Types +// ============================================================================ + +export interface PythonScriptInput { + code: string; + kernel_id: string | null; +} + // ============================================================================ // Union Types // ============================================================================ @@ -747,6 +756,10 @@ export interface NodePolarsCode extends NodeSingleInput { polars_code_input: PolarsCodeInput; } +export interface NodePythonScript extends NodeMultiInput { + python_script_input: PythonScriptInput; +} + export interface NodeUnique extends NodeSingleInput { unique_input: UniqueInput; } From 9ee26f6617aaed3a4f557b2081106d0922c3108a Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Sun, 1 Feb 2026 10:36:17 +0100 Subject: [PATCH 05/38] Add kernel persistence and multi-user access control (#286) * Persist kernel configurations in database and clean up on shutdown Kernels are now stored in a `kernels` table (tied to user_id) so they survive core process restarts. On startup the KernelManager restores persisted configs from the DB, then reclaims any running Docker containers that match; orphan containers with no DB record are stopped. All kernel REST routes now require authentication and enforce per-user ownership (list returns only the caller's kernels, mutations check ownership before proceeding). On core shutdown (lifespan handler, SIGTERM, SIGINT) every running kernel container is stopped and removed via `shutdown_all()`. https://claude.ai/code/session_01PcxZsx9KTQvHLDvzgAUjzC * Check Docker image availability before starting a kernel start_kernel now explicitly checks for the flowfile-kernel image before attempting to run a container, giving a clear error message ("Docker image 'flowfile-kernel' not found. Please build or pull...") instead of a raw Docker API exception. https://claude.ai/code/session_01PcxZsx9KTQvHLDvzgAUjzC * Allocate port lazily in start_kernel for DB-restored kernels Kernels restored from the database have port=None since ports are ephemeral and not persisted. start_kernel now calls _allocate_port() when kernel.port is None, fixing the "Invalid port: 'None'" error that occurred when starting a kernel after a core restart. 
https://claude.ai/code/session_01PcxZsx9KTQvHLDvzgAUjzC --- .../flowfile_core/database/models.py | 15 ++- flowfile_core/flowfile_core/kernel/manager.py | 122 ++++++++++++++++-- .../flowfile_core/kernel/persistence.py | 71 ++++++++++ flowfile_core/flowfile_core/kernel/routes.py | 80 ++++++++---- flowfile_core/flowfile_core/main.py | 16 ++- flowfile_core/tests/kernel_fixtures.py | 2 +- 6 files changed, 266 insertions(+), 40 deletions(-) create mode 100644 flowfile_core/flowfile_core/kernel/persistence.py diff --git a/flowfile_core/flowfile_core/database/models.py b/flowfile_core/flowfile_core/database/models.py index c92458be6..ae849ca6d 100644 --- a/flowfile_core/flowfile_core/database/models.py +++ b/flowfile_core/flowfile_core/database/models.py @@ -1,4 +1,4 @@ -from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text +from sqlalchemy import Boolean, Column, DateTime, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.sql import func @@ -88,3 +88,16 @@ class CloudStoragePermission(Base): can_write = Column(Boolean, default=False) can_delete = Column(Boolean, default=False) can_list = Column(Boolean, default=True) + + +class Kernel(Base): + __tablename__ = "kernels" + + id = Column(String, primary_key=True, index=True) + name = Column(String, nullable=False) + user_id = Column(Integer, ForeignKey("users.id"), nullable=False) + packages = Column(Text, default="[]") # JSON-serialized list of package names + cpu_cores = Column(Float, default=2.0) + memory_gb = Column(Float, default=4.0) + gpu = Column(Boolean, default=False) + created_at = Column(DateTime, default=func.now(), nullable=False) diff --git a/flowfile_core/flowfile_core/kernel/manager.py b/flowfile_core/flowfile_core/kernel/manager.py index a01adf47c..4c6f2fdc3 100644 --- a/flowfile_core/flowfile_core/kernel/manager.py +++ b/flowfile_core/flowfile_core/kernel/manager.py @@ -37,13 +37,66 @@ class KernelManager: def __init__(self, shared_volume_path: str | None = None): self._docker = docker.from_env() self._kernels: dict[str, KernelInfo] = {} + self._kernel_owners: dict[str, int] = {} # kernel_id -> user_id self._shared_volume = shared_volume_path or str(storage.cache_directory) + self._restore_kernels_from_db() self._reclaim_running_containers() @property def shared_volume_path(self) -> str: return self._shared_volume + # ------------------------------------------------------------------ + # Database persistence helpers + # ------------------------------------------------------------------ + + def _restore_kernels_from_db(self) -> None: + """Load persisted kernel configs from the database on startup.""" + try: + from flowfile_core.database.connection import get_db_context + from flowfile_core.kernel.persistence import get_all_kernels + + with get_db_context() as db: + for config, user_id in get_all_kernels(db): + if config.id in self._kernels: + continue + kernel = KernelInfo( + id=config.id, + name=config.name, + state=KernelState.STOPPED, + packages=config.packages, + memory_gb=config.memory_gb, + cpu_cores=config.cpu_cores, + gpu=config.gpu, + ) + self._kernels[config.id] = kernel + self._kernel_owners[config.id] = user_id + logger.info("Restored kernel '%s' for user %d from database", config.id, user_id) + except Exception as exc: + logger.warning("Could not restore kernels from database: %s", exc) + + def _persist_kernel(self, kernel: KernelInfo, user_id: int) -> None: + """Save a kernel record to the database.""" + try: + from 
flowfile_core.database.connection import get_db_context + from flowfile_core.kernel.persistence import save_kernel + + with get_db_context() as db: + save_kernel(db, kernel, user_id) + except Exception as exc: + logger.warning("Could not persist kernel '%s': %s", kernel.id, exc) + + def _remove_kernel_from_db(self, kernel_id: str) -> None: + """Remove a kernel record from the database.""" + try: + from flowfile_core.database.connection import get_db_context + from flowfile_core.kernel.persistence import delete_kernel + + with get_db_context() as db: + delete_kernel(db, kernel_id) + except Exception as exc: + logger.warning("Could not remove kernel '%s' from database: %s", kernel_id, exc) + # ------------------------------------------------------------------ # Port allocation # ------------------------------------------------------------------ @@ -73,18 +126,26 @@ def _reclaim_running_containers(self) -> None: except (KeyError, IndexError, TypeError, ValueError): pass - if port is not None and kernel_id not in self._kernels: - self._kernels[kernel_id] = KernelInfo( - id=kernel_id, - name=kernel_id, - state=KernelState.IDLE, - container_id=container.id, - port=port, - ) + if port is not None and kernel_id in self._kernels: + # Kernel was restored from DB — update with runtime info + self._kernels[kernel_id].container_id = container.id + self._kernels[kernel_id].port = port + self._kernels[kernel_id].state = KernelState.IDLE logger.info( "Reclaimed running kernel '%s' on port %d (container %s)", kernel_id, port, container.short_id, ) + elif port is not None and kernel_id not in self._kernels: + # Orphan container with no DB record — stop it + logger.warning( + "Found orphan kernel container '%s' with no database record, stopping it", + kernel_id, + ) + try: + container.stop(timeout=10) + container.remove(force=True) + except Exception as exc: + logger.warning("Error stopping orphan container '%s': %s", kernel_id, exc) def _allocate_port(self) -> int: """Find the next available port in the kernel port range.""" @@ -100,7 +161,7 @@ def _allocate_port(self) -> int: # Lifecycle # ------------------------------------------------------------------ - async def create_kernel(self, config: KernelConfig) -> KernelInfo: + async def create_kernel(self, config: KernelConfig, user_id: int) -> KernelInfo: if config.id in self._kernels: raise ValueError(f"Kernel '{config.id}' already exists") @@ -117,7 +178,9 @@ async def create_kernel(self, config: KernelConfig) -> KernelInfo: health_timeout=config.health_timeout, ) self._kernels[config.id] = kernel - logger.info("Created kernel '%s' on port %d", config.id, port) + self._kernel_owners[config.id] = user_id + self._persist_kernel(kernel, user_id) + logger.info("Created kernel '%s' on port %d for user %d", config.id, port, user_id) return kernel async def start_kernel(self, kernel_id: str) -> KernelInfo: @@ -125,6 +188,21 @@ async def start_kernel(self, kernel_id: str) -> KernelInfo: if kernel.state == KernelState.IDLE: return kernel + # Verify the kernel image exists before attempting to start + try: + self._docker.images.get(_KERNEL_IMAGE) + except docker.errors.ImageNotFound: + kernel.state = KernelState.ERROR + kernel.error_message = ( + f"Docker image '{_KERNEL_IMAGE}' not found. " + "Please build or pull the kernel image before starting a kernel." + ) + raise RuntimeError(kernel.error_message) + + # Allocate a port if the kernel doesn't have one yet (e.g. 
restored from DB) + if kernel.port is None: + kernel.port = self._allocate_port() + kernel.state = KernelState.STARTING kernel.error_message = None @@ -165,8 +243,22 @@ async def delete_kernel(self, kernel_id: str) -> None: if kernel.state in (KernelState.IDLE, KernelState.EXECUTING): await self.stop_kernel(kernel_id) del self._kernels[kernel_id] + self._kernel_owners.pop(kernel_id, None) + self._remove_kernel_from_db(kernel_id) logger.info("Deleted kernel '%s'", kernel_id) + def shutdown_all(self) -> None: + """Stop and remove all running kernel containers. Called on core shutdown.""" + kernel_ids = list(self._kernels.keys()) + for kernel_id in kernel_ids: + kernel = self._kernels.get(kernel_id) + if kernel and kernel.state in (KernelState.IDLE, KernelState.EXECUTING, KernelState.STARTING): + logger.info("Shutting down kernel '%s'", kernel_id) + self._cleanup_container(kernel_id) + kernel.state = KernelState.STOPPED + kernel.container_id = None + logger.info("All kernels have been shut down") + # ------------------------------------------------------------------ # Execution # ------------------------------------------------------------------ @@ -230,12 +322,20 @@ def clear_artifacts_sync(self, kernel_id: str) -> None: # Queries # ------------------------------------------------------------------ - async def list_kernels(self) -> list[KernelInfo]: + async def list_kernels(self, user_id: int | None = None) -> list[KernelInfo]: + if user_id is not None: + return [ + k for kid, k in self._kernels.items() + if self._kernel_owners.get(kid) == user_id + ] return list(self._kernels.values()) async def get_kernel(self, kernel_id: str) -> KernelInfo | None: return self._kernels.get(kernel_id) + def get_kernel_owner(self, kernel_id: str) -> int | None: + return self._kernel_owners.get(kernel_id) + # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ diff --git a/flowfile_core/flowfile_core/kernel/persistence.py b/flowfile_core/flowfile_core/kernel/persistence.py new file mode 100644 index 000000000..412f33f4a --- /dev/null +++ b/flowfile_core/flowfile_core/kernel/persistence.py @@ -0,0 +1,71 @@ +"""Database persistence for kernel configurations. + +Kernels are persisted so they survive core process restarts. Only the +configuration is stored (id, name, packages, resource limits, user ownership). +Runtime state (container_id, port, state) is ephemeral and reconstructed at +startup by reclaiming running Docker containers. 
+""" + +import json +import logging + +from sqlalchemy.orm import Session + +from flowfile_core.database import models as db_models +from flowfile_core.kernel.models import KernelConfig, KernelInfo + +logger = logging.getLogger(__name__) + + +def save_kernel(db: Session, kernel: KernelInfo, user_id: int) -> None: + """Insert or update a kernel record in the database.""" + existing = db.query(db_models.Kernel).filter(db_models.Kernel.id == kernel.id).first() + if existing: + existing.name = kernel.name + existing.packages = json.dumps(kernel.packages) + existing.cpu_cores = kernel.cpu_cores + existing.memory_gb = kernel.memory_gb + existing.gpu = kernel.gpu + existing.user_id = user_id + else: + record = db_models.Kernel( + id=kernel.id, + name=kernel.name, + user_id=user_id, + packages=json.dumps(kernel.packages), + cpu_cores=kernel.cpu_cores, + memory_gb=kernel.memory_gb, + gpu=kernel.gpu, + ) + db.add(record) + db.commit() + + +def delete_kernel(db: Session, kernel_id: str) -> None: + """Remove a kernel record from the database.""" + db.query(db_models.Kernel).filter(db_models.Kernel.id == kernel_id).delete() + db.commit() + + +def get_kernels_for_user(db: Session, user_id: int) -> list[KernelConfig]: + """Return all persisted kernel configs belonging to a user.""" + rows = db.query(db_models.Kernel).filter(db_models.Kernel.user_id == user_id).all() + return [_row_to_config(row) for row in rows] + + +def get_all_kernels(db: Session) -> list[tuple[KernelConfig, int]]: + """Return all persisted kernels as (config, user_id) tuples.""" + rows = db.query(db_models.Kernel).all() + return [(_row_to_config(row), row.user_id) for row in rows] + + +def _row_to_config(row: db_models.Kernel) -> KernelConfig: + packages = json.loads(row.packages) if row.packages else [] + return KernelConfig( + id=row.id, + name=row.name, + packages=packages, + cpu_cores=row.cpu_cores, + memory_gb=row.memory_gb, + gpu=row.gpu, + ) diff --git a/flowfile_core/flowfile_core/kernel/routes.py b/flowfile_core/flowfile_core/kernel/routes.py index 54ab14277..8aef90c32 100644 --- a/flowfile_core/flowfile_core/kernel/routes.py +++ b/flowfile_core/flowfile_core/kernel/routes.py @@ -1,7 +1,8 @@ import logging -from fastapi import APIRouter, HTTPException +from fastapi import APIRouter, Depends, HTTPException +from flowfile_core.auth.jwt import get_current_active_user from flowfile_core.kernel.models import DockerStatus, ExecuteRequest, ExecuteResult, KernelConfig, KernelInfo logger = logging.getLogger(__name__) @@ -20,18 +21,18 @@ def _get_manager(): ) -router = APIRouter(prefix="/kernels") +router = APIRouter(prefix="/kernels", dependencies=[Depends(get_current_active_user)]) @router.get("/", response_model=list[KernelInfo]) -async def list_kernels(): - return await _get_manager().list_kernels() +async def list_kernels(current_user=Depends(get_current_active_user)): + return await _get_manager().list_kernels(user_id=current_user.id) @router.post("/", response_model=KernelInfo) -async def create_kernel(config: KernelConfig): +async def create_kernel(config: KernelConfig, current_user=Depends(get_current_active_user)): try: - return await _get_manager().create_kernel(config) + return await _get_manager().create_kernel(config, user_id=current_user.id) except ValueError as exc: raise HTTPException(status_code=409, detail=str(exc)) @@ -61,57 +62,82 @@ async def docker_status(): @router.get("/{kernel_id}", response_model=KernelInfo) -async def get_kernel(kernel_id: str): - kernel = await _get_manager().get_kernel(kernel_id) +async 
def get_kernel(kernel_id: str, current_user=Depends(get_current_active_user)): + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) if kernel is None: raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") return kernel @router.delete("/{kernel_id}") -async def delete_kernel(kernel_id: str): +async def delete_kernel(kernel_id: str, current_user=Depends(get_current_active_user)): + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") try: - await _get_manager().delete_kernel(kernel_id) + await manager.delete_kernel(kernel_id) return {"status": "deleted", "kernel_id": kernel_id} except KeyError as exc: raise HTTPException(status_code=404, detail=str(exc)) @router.post("/{kernel_id}/start", response_model=KernelInfo) -async def start_kernel(kernel_id: str): +async def start_kernel(kernel_id: str, current_user=Depends(get_current_active_user)): + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") try: - return await _get_manager().start_kernel(kernel_id) - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) + return await manager.start_kernel(kernel_id) except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) @router.post("/{kernel_id}/stop") -async def stop_kernel(kernel_id: str): +async def stop_kernel(kernel_id: str, current_user=Depends(get_current_active_user)): + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") try: - await _get_manager().stop_kernel(kernel_id) + await manager.stop_kernel(kernel_id) return {"status": "stopped", "kernel_id": kernel_id} except KeyError as exc: raise HTTPException(status_code=404, detail=str(exc)) @router.post("/{kernel_id}/execute", response_model=ExecuteResult) -async def execute_code(kernel_id: str, request: ExecuteRequest): +async def execute_code(kernel_id: str, request: ExecuteRequest, current_user=Depends(get_current_active_user)): + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") try: - return await _get_manager().execute(kernel_id, request) - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) + return await manager.execute(kernel_id, request) except RuntimeError as exc: raise HTTPException(status_code=400, detail=str(exc)) @router.get("/{kernel_id}/artifacts") -async def get_artifacts(kernel_id: str): +async def 
get_artifacts(kernel_id: str, current_user=Depends(get_current_active_user)): manager = _get_manager() kernel = await manager.get_kernel(kernel_id) if kernel is None: raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") if kernel.state.value not in ("idle", "executing"): raise HTTPException(status_code=400, detail=f"Kernel '{kernel_id}' is not running") @@ -128,11 +154,15 @@ async def get_artifacts(kernel_id: str): @router.post("/{kernel_id}/clear") -async def clear_artifacts(kernel_id: str): +async def clear_artifacts(kernel_id: str, current_user=Depends(get_current_active_user)): + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") try: - await _get_manager().clear_artifacts(kernel_id) + await manager.clear_artifacts(kernel_id) return {"status": "cleared", "kernel_id": kernel_id} - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) except RuntimeError as exc: raise HTTPException(status_code=400, detail=str(exc)) diff --git a/flowfile_core/flowfile_core/main.py b/flowfile_core/flowfile_core/main.py index 9f772f829..2c8f11237 100644 --- a/flowfile_core/flowfile_core/main.py +++ b/flowfile_core/flowfile_core/main.py @@ -39,8 +39,8 @@ async def shutdown_handler(app: FastAPI): """Handles the graceful startup and shutdown of the FastAPI application. - This context manager ensures that resources, such as log files, are cleaned - up properly when the application is terminated. + This context manager ensures that resources, such as log files and kernel + containers, are cleaned up properly when the application is terminated. 
""" print("Starting core application...") try: @@ -48,10 +48,22 @@ async def shutdown_handler(app: FastAPI): finally: print("Shutting down core application...") print("Cleaning up core service resources...") + _shutdown_kernels() clear_all_flow_logs() await asyncio.sleep(0.1) # Give a moment for cleanup +def _shutdown_kernels(): + """Stop all running kernel containers during shutdown.""" + try: + from flowfile_core.kernel import get_kernel_manager + + manager = get_kernel_manager() + manager.shutdown_all() + except Exception as exc: + print(f"Error shutting down kernels: {exc}") + + # Initialize FastAPI with metadata app = FastAPI( title="Flowfile Backend", diff --git a/flowfile_core/tests/kernel_fixtures.py b/flowfile_core/tests/kernel_fixtures.py index 38afc2516..a686891b6 100644 --- a/flowfile_core/tests/kernel_fixtures.py +++ b/flowfile_core/tests/kernel_fixtures.py @@ -101,7 +101,7 @@ def managed_kernel( name="Integration Test Kernel", packages=packages or [], ) - loop.run_until_complete(manager.create_kernel(config)) + loop.run_until_complete(manager.create_kernel(config, user_id=1)) loop.run_until_complete(manager.start_kernel(kernel_id)) yield manager, kernel_id From 8b875d3799324a47b3a96f24bdd870aaac2d099c Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Mon, 2 Feb 2026 06:47:05 +0100 Subject: [PATCH 06/38] Add kernel runtime management with Docker containerization (#281) (#290) * Add flowfile.log() method for real-time log streaming from kernel to frontend Enable Python script nodes to stream log messages to the FlowFile log viewer in real time via flowfile.log(). The kernel container makes HTTP callbacks to the core's /raw_logs endpoint, which writes to the FlowLogger file. The existing SSE streamer picks up new lines and pushes them to the frontend immediately. Changes: - Add log(), log_info(), log_warning(), log_error() to flowfile_client - Pass flow_id and log_callback_url through ExecuteRequest to kernel - Add host.docker.internal mapping to kernel Docker containers - Update RawLogInput schema to support node_id and WARNING level - Forward captured stdout/stderr to FlowLogger after execution https://claude.ai/code/session_01Svv5uYus8tnHofhH667KKB * Add kernel runtime versioning visible in frontend Add __version__ to the kernel_runtime package (0.2.0) and expose it through the /health endpoint. The KernelManager reads the version when the container becomes healthy and stores it on KernelInfo. The frontend KernelCard displays the version badge next to the kernel ID so users can verify which image version a running kernel is using. 
--- .../flowfile_core/flowfile/flow_graph.py | 15 ++++ flowfile_core/flowfile_core/kernel/manager.py | 3 + flowfile_core/flowfile_core/kernel/models.py | 3 + flowfile_core/flowfile_core/routes/logs.py | 10 +-- .../flowfile_core/schemas/schemas.py | 6 +- .../src/renderer/app/types/kernel.types.ts | 1 + .../views/KernelManagerView/KernelCard.vue | 23 +++++- flowfile_worker/flowfile_worker/models.py | 3 +- kernel_runtime/kernel_runtime/__init__.py | 1 + .../kernel_runtime/flowfile_client.py | 77 ++++++++++++++++++- kernel_runtime/kernel_runtime/main.py | 10 ++- 11 files changed, 139 insertions(+), 13 deletions(-) diff --git a/flowfile_core/flowfile_core/flowfile/flow_graph.py b/flowfile_core/flowfile_core/flowfile/flow_graph.py index 9a20db5bc..5333aad53 100644 --- a/flowfile_core/flowfile_core/flowfile/flow_graph.py +++ b/flowfile_core/flowfile_core/flowfile/flow_graph.py @@ -1123,6 +1123,7 @@ def add_python_script(self, node_python_script: input_schema.NodePythonScript): """Adds a node that executes Python code on a kernel container.""" def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine: + from flowfile_core.configs.settings import SERVER_PORT from flowfile_core.kernel import ExecuteRequest, get_kernel_manager kernel_id = node_python_script.python_script_input.kernel_id @@ -1135,6 +1136,7 @@ def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine: node_id = node_python_script.node_id flow_id = self.flow_id + node_logger = self.flow_logger.get_node_logger(node_id) # Compute available artifacts before execution upstream_ids = self._get_upstream_node_ids(node_id) @@ -1161,15 +1163,28 @@ def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine: main_paths.append(f"/shared/{flow_id}/{node_id}/inputs/{filename}") input_paths["main"] = main_paths + # Build the callback URL so the kernel can stream logs in real time + log_callback_url = f"http://host.docker.internal:{SERVER_PORT}/raw_logs" + # Execute on kernel (synchronous — no async boundary issues) request = ExecuteRequest( node_id=node_id, code=code, input_paths=input_paths, output_dir=f"/shared/{flow_id}/{node_id}/outputs", + flow_id=flow_id, + log_callback_url=log_callback_url, ) result = manager.execute_sync(kernel_id, request) + # Forward captured stdout/stderr to the flow logger + if result.stdout: + for line in result.stdout.strip().splitlines(): + node_logger.info(f"[stdout] {line}") + if result.stderr: + for line in result.stderr.strip().splitlines(): + node_logger.warning(f"[stderr] {line}") + if not result.success: raise RuntimeError(f"Kernel execution failed: {result.error}") diff --git a/flowfile_core/flowfile_core/kernel/manager.py b/flowfile_core/flowfile_core/kernel/manager.py index 4c6f2fdc3..d3fecd4c4 100644 --- a/flowfile_core/flowfile_core/kernel/manager.py +++ b/flowfile_core/flowfile_core/kernel/manager.py @@ -216,6 +216,7 @@ async def start_kernel(self, kernel_id: str) -> KernelInfo: "environment": {"KERNEL_PACKAGES": packages_str}, "mem_limit": f"{kernel.memory_gb}g", "nano_cpus": int(kernel.cpu_cores * 1e9), + "extra_hosts": {"host.docker.internal": "host-gateway"}, } container = self._docker.containers.run(_KERNEL_IMAGE, **run_kwargs) kernel.container_id = container.id @@ -370,6 +371,8 @@ async def _wait_for_healthy(self, kernel_id: str, timeout: int = _HEALTH_TIMEOUT async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client: response = await client.get(url) if response.status_code == 200: + data = response.json() + kernel.kernel_version = data.get("version") return except 
(httpx.HTTPError, OSError) as exc: logger.debug("Health poll for kernel '%s' failed: %s", kernel_id, exc) diff --git a/flowfile_core/flowfile_core/kernel/models.py b/flowfile_core/flowfile_core/kernel/models.py index c7d71389b..7d5158c30 100644 --- a/flowfile_core/flowfile_core/kernel/models.py +++ b/flowfile_core/flowfile_core/kernel/models.py @@ -35,6 +35,7 @@ class KernelInfo(BaseModel): health_timeout: int = 120 created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) error_message: str | None = None + kernel_version: str | None = None class DockerStatus(BaseModel): @@ -48,6 +49,8 @@ class ExecuteRequest(BaseModel): code: str input_paths: dict[str, list[str]] = Field(default_factory=dict) output_dir: str = "" + flow_id: int = 0 + log_callback_url: str = "" class ExecuteResult(BaseModel): diff --git a/flowfile_core/flowfile_core/routes/logs.py b/flowfile_core/flowfile_core/routes/logs.py index 0d8a5de6a..2b3d78fed 100644 --- a/flowfile_core/flowfile_core/routes/logs.py +++ b/flowfile_core/flowfile_core/routes/logs.py @@ -45,17 +45,17 @@ async def add_log(flow_id: int, log_message: str): @router.post("/raw_logs", tags=["flow_logging"]) async def add_raw_log(raw_log_input: schemas.RawLogInput): """Adds a log message to the log file for a given flow_id.""" - logger.info("Adding raw logs") flow = flow_file_handler.get_flow(raw_log_input.flowfile_flow_id) if not flow: raise HTTPException(status_code=404, detail="Flow not found") - flow.flow_logger.get_log_filepath() flow_logger = flow.flow_logger - flow_logger.get_log_filepath() + node_id = raw_log_input.node_id if raw_log_input.node_id is not None else -1 if raw_log_input.log_type == "INFO": - flow_logger.info(raw_log_input.log_message, extra=raw_log_input.extra) + flow_logger.info(raw_log_input.log_message, extra=raw_log_input.extra, node_id=node_id) + elif raw_log_input.log_type == "WARNING": + flow_logger.warning(raw_log_input.log_message, extra=raw_log_input.extra, node_id=node_id) elif raw_log_input.log_type == "ERROR": - flow_logger.error(raw_log_input.log_message, extra=raw_log_input.extra) + flow_logger.error(raw_log_input.log_message, extra=raw_log_input.extra, node_id=node_id) return {"message": "Log added successfully"} diff --git a/flowfile_core/flowfile_core/schemas/schemas.py b/flowfile_core/flowfile_core/schemas/schemas.py index 4f903f627..ed3dcb256 100644 --- a/flowfile_core/flowfile_core/schemas/schemas.py +++ b/flowfile_core/flowfile_core/schemas/schemas.py @@ -175,13 +175,15 @@ class RawLogInput(BaseModel): Attributes: flowfile_flow_id (int): The ID of the flow that generated the log. log_message (str): The content of the log message. - log_type (Literal["INFO", "ERROR"]): The type of log. + log_type (Literal["INFO", "WARNING", "ERROR"]): The type of log. + node_id (int | None): Optional node ID to attribute the log to. extra (Optional[dict]): Extra context data for the log. 
""" flowfile_flow_id: int log_message: str - log_type: Literal["INFO", "ERROR"] + log_type: Literal["INFO", "WARNING", "ERROR"] + node_id: int | None = None extra: dict | None = None diff --git a/flowfile_frontend/src/renderer/app/types/kernel.types.ts b/flowfile_frontend/src/renderer/app/types/kernel.types.ts index 8a6d05aa7..349ce0074 100644 --- a/flowfile_frontend/src/renderer/app/types/kernel.types.ts +++ b/flowfile_frontend/src/renderer/app/types/kernel.types.ts @@ -29,4 +29,5 @@ export interface KernelInfo { gpu: boolean; created_at: string; error_message: string | null; + kernel_version: string | null; } diff --git a/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue index e4ddedbf0..86bafae99 100644 --- a/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue +++ b/flowfile_frontend/src/renderer/app/views/KernelManagerView/KernelCard.vue @@ -5,7 +5,12 @@
     <div class="kernel-card__header">
       <div class="kernel-card__title">
         <h3 class="kernel-card__name">{{ kernel.name }}</h3>
-        <p class="kernel-card__id">{{ kernel.id }}</p>
+        <div class="kernel-card__meta">
+          <p class="kernel-card__id">{{ kernel.id }}</p>
+          <span v-if="kernel.kernel_version" class="kernel-card__version">
+            v{{ kernel.kernel_version }}
+          </span>
+        </div>
       </div>
     </div>
 
@@ -133,6 +138,12 @@ const displayedPackages = computed(() => props.kernel.packages.slice(0, maxPacka white-space: nowrap; } +.kernel-card__meta { + display: flex; + align-items: center; + gap: var(--spacing-2); +} + .kernel-card__id { font-size: var(--font-size-2xs); color: var(--color-text-muted); @@ -140,6 +151,16 @@ const displayedPackages = computed(() => props.kernel.packages.slice(0, maxPacka margin: 0; } +.kernel-card__version { + font-size: var(--font-size-2xs); + color: var(--color-accent); + font-family: var(--font-family-mono); + font-weight: var(--font-weight-medium); + background-color: var(--color-accent-subtle); + padding: 0 var(--spacing-1); + border-radius: var(--border-radius-sm); +} + .kernel-card__body { display: flex; flex-direction: column; diff --git a/flowfile_worker/flowfile_worker/models.py b/flowfile_worker/flowfile_worker/models.py index 658b6e936..d72b2d3a3 100644 --- a/flowfile_worker/flowfile_worker/models.py +++ b/flowfile_worker/flowfile_worker/models.py @@ -140,5 +140,6 @@ def __hash__(self): class RawLogInput(BaseModel): flowfile_flow_id: int log_message: str - log_type: Literal["INFO", "ERROR"] + log_type: Literal["INFO", "WARNING", "ERROR"] + node_id: int | None = None extra: dict | None = None diff --git a/kernel_runtime/kernel_runtime/__init__.py b/kernel_runtime/kernel_runtime/__init__.py index e69de29bb..49f34f498 100644 --- a/kernel_runtime/kernel_runtime/__init__.py +++ b/kernel_runtime/kernel_runtime/__init__.py @@ -0,0 +1 @@ +__version__ = "0.2.0" \ No newline at end of file diff --git a/kernel_runtime/kernel_runtime/flowfile_client.py b/kernel_runtime/kernel_runtime/flowfile_client.py index bed63c2e8..1c5e5bf0d 100644 --- a/kernel_runtime/kernel_runtime/flowfile_client.py +++ b/kernel_runtime/kernel_runtime/flowfile_client.py @@ -3,30 +3,52 @@ import contextvars import os from pathlib import Path -from typing import Any +from typing import Any, Literal +import httpx import polars as pl from kernel_runtime.artifact_store import ArtifactStore _context: contextvars.ContextVar[dict[str, Any]] = contextvars.ContextVar("flowfile_context") +# Reusable HTTP client for log callbacks (created per execution context) +_log_client: contextvars.ContextVar[httpx.Client | None] = contextvars.ContextVar( + "flowfile_log_client", default=None +) + def _set_context( node_id: int, input_paths: dict[str, list[str]], output_dir: str, artifact_store: ArtifactStore, + flow_id: int = 0, + log_callback_url: str = "", ) -> None: _context.set({ "node_id": node_id, "input_paths": input_paths, "output_dir": output_dir, "artifact_store": artifact_store, + "flow_id": flow_id, + "log_callback_url": log_callback_url, }) + # Create a reusable HTTP client for log callbacks + if log_callback_url: + _log_client.set(httpx.Client(timeout=httpx.Timeout(5.0))) + else: + _log_client.set(None) def _clear_context() -> None: + client = _log_client.get(None) + if client is not None: + try: + client.close() + except Exception: + pass + _log_client.set(None) _context.set({}) @@ -110,3 +132,56 @@ def delete_artifact(name: str) -> None: def list_artifacts() -> dict: store: ArtifactStore = _get_context_value("artifact_store") return store.list_all() + + +# ===== Logging APIs ===== + +def log(message: str, level: Literal["INFO", "WARNING", "ERROR"] = "INFO") -> None: + """Send a log message to the FlowFile log viewer. + + The message appears in the frontend log stream in real time. + + Args: + message: The log message text. 
+ level: Log severity — ``"INFO"`` (default), ``"WARNING"``, or ``"ERROR"``. + """ + flow_id: int = _get_context_value("flow_id") + node_id: int = _get_context_value("node_id") + callback_url: str = _get_context_value("log_callback_url") + if not callback_url: + # No callback configured — fall back to printing so the message + # still shows up in captured stdout. + print(f"[{level}] {message}") # noqa: T201 + return + + client = _log_client.get(None) + if client is None: + print(f"[{level}] {message}") # noqa: T201 + return + + payload = { + "flowfile_flow_id": flow_id, + "node_id": node_id, + "log_message": message, + "log_type": level, + } + try: + client.post(callback_url, json=payload) + except Exception: + # Best-effort — don't let logging failures break user code. + pass + + +def log_info(message: str) -> None: + """Convenience wrapper: ``flowfile.log(message, level="INFO")``.""" + log(message, level="INFO") + + +def log_warning(message: str) -> None: + """Convenience wrapper: ``flowfile.log(message, level="WARNING")``.""" + log(message, level="WARNING") + + +def log_error(message: str) -> None: + """Convenience wrapper: ``flowfile.log(message, level="ERROR")``.""" + log(message, level="ERROR") diff --git a/kernel_runtime/kernel_runtime/main.py b/kernel_runtime/kernel_runtime/main.py index 36b6d5ff2..918a989a7 100644 --- a/kernel_runtime/kernel_runtime/main.py +++ b/kernel_runtime/kernel_runtime/main.py @@ -7,10 +7,10 @@ from fastapi import FastAPI from pydantic import BaseModel -from kernel_runtime import flowfile_client +from kernel_runtime import __version__, flowfile_client from kernel_runtime.artifact_store import ArtifactStore -app = FastAPI(title="FlowFile Kernel Runtime") +app = FastAPI(title="FlowFile Kernel Runtime", version=__version__) artifact_store = ArtifactStore() @@ -19,6 +19,8 @@ class ExecuteRequest(BaseModel): code: str input_paths: dict[str, list[str]] = {} output_dir: str = "" + flow_id: int = 0 + log_callback_url: str = "" class ExecuteResponse(BaseModel): @@ -50,6 +52,8 @@ async def execute(request: ExecuteRequest): input_paths=request.input_paths, output_dir=output_dir, artifact_store=artifact_store, + flow_id=request.flow_id, + log_callback_url=request.log_callback_url, ) with contextlib.redirect_stdout(stdout_buf), contextlib.redirect_stderr(stderr_buf): @@ -102,4 +106,4 @@ async def list_artifacts(): @app.get("/health") async def health(): - return {"status": "healthy", "artifact_count": len(artifact_store.list_all())} + return {"status": "healthy", "version": __version__, "artifact_count": len(artifact_store.list_all())} From 7adb95ac2374b59878392db2c9454976bdc9b9d8 Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Mon, 2 Feb 2026 17:53:32 +0100 Subject: [PATCH 07/38] Implement selective artifact clearing for incremental flow execution (#291) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix artifact loss in debug mode by implementing selective clearing Previously, run_graph() cleared ALL artifacts from both the metadata tracker and kernel memory before every run. When a node was skipped (up-to-date), the metadata was restored from a snapshot but the actual Python objects in kernel memory were already gone. Downstream nodes that depended on those artifacts would fail with KeyError. 
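As a concrete illustration of the selective, flow-scoped behaviour introduced here (see the change list below), this is roughly how the store behaves after the change; the artifact names, node IDs and flow ID are illustrative only:

    from kernel_runtime.artifact_store import ArtifactStore

    # Illustrative artifact names, node IDs and flow ID.
    store = ArtifactStore()
    store.publish("linear_model", {"coefficients": [1.0, 2.0]}, node_id=2, flow_id=1)
    store.publish("predictions", [5.0, 8.0, 11.0], node_id=3, flow_id=1)

    # Only node 3 is about to re-execute, so only its artifacts are dropped;
    # node 2's "linear_model" stays in kernel memory for downstream reads.
    removed = store.clear_by_node_ids({3}, flow_id=1)
    assert removed == ["predictions"]
    assert store.get("linear_model", flow_id=1)["coefficients"] == [1.0, 2.0]

The core orchestrator triggers the same behaviour over HTTP through the new POST /clear_node_artifacts endpoint with a body such as {"node_ids": [3], "flow_id": 1}.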
The fix introduces artifact ownership tracking so that only artifacts from nodes that will actually re-execute are cleared: - ArtifactStore: add clear_by_node_ids() and list_by_node_id() - Kernel runtime: add POST /clear_node_artifacts and GET /artifacts/node/{id} - KernelManager: add clear_node_artifacts_sync() and get_node_artifacts() - ArtifactContext: add clear_nodes() for selective metadata clearing - Kernel routes: add /clear_node_artifacts and /artifacts/node/{id} endpoints - flow_graph.run_graph(): compute execution plan first, determine which python_script nodes will re-run, and only clear those nodes' artifacts. Skipped nodes keep their artifacts in both metadata and kernel memory. * Add integration tests for debug mode artifact persistence Tests verify that artifacts survive re-runs when producing nodes are skipped (up-to-date) and only consuming nodes re-execute, covering the core bug scenario, multiple artifacts, and producer re-run clearing. * Auto-clear node's own artifacts before re-execution in /execute When a node re-executes (e.g., forced refresh, performance mode re-run), its previously published artifacts are now automatically cleared before the new code runs. This prevents "Artifact already exists" errors without requiring manual delete_artifact() calls in user code. The clearing is scoped to the executing node's own artifacts only — artifacts from other nodes are untouched. * Scope artifacts by flow_id so multiple flows sharing a kernel are isolated The artifact store now keys artifacts by (flow_id, name) instead of just name. Two flows using the same kernel can each publish an artifact called "model" without colliding. All artifact operations (publish, read, delete, list, clear) are flow-scoped transparently via the execution context. --- .../flowfile_core/flowfile/artifacts.py | 31 ++ .../flowfile_core/flowfile/flow_graph.py | 101 +++-- .../flowfile_core/kernel/__init__.py | 4 + flowfile_core/flowfile_core/kernel/manager.py | 47 ++ flowfile_core/flowfile_core/kernel/models.py | 12 + flowfile_core/flowfile_core/kernel/routes.py | 50 ++- .../tests/flowfile/test_artifact_context.py | 73 ++++ .../tests/flowfile/test_kernel_integration.py | 410 ++++++++++++++++++ .../kernel_runtime/artifact_store.py | 85 +++- .../kernel_runtime/flowfile_client.py | 12 +- kernel_runtime/kernel_runtime/main.py | 42 +- kernel_runtime/tests/test_artifact_store.py | 125 ++++++ kernel_runtime/tests/test_main.py | 357 +++++++++++++++ 13 files changed, 1295 insertions(+), 54 deletions(-) diff --git a/flowfile_core/flowfile_core/flowfile/artifacts.py b/flowfile_core/flowfile_core/flowfile/artifacts.py index 1381643a2..c877cfd37 100644 --- a/flowfile_core/flowfile_core/flowfile/artifacts.py +++ b/flowfile_core/flowfile_core/flowfile/artifacts.py @@ -261,6 +261,37 @@ def clear_all(self) -> None: self._kernel_artifacts.clear() self._publisher_index.clear() + def clear_nodes(self, node_ids: set[int]) -> None: + """Remove tracking data only for the specified *node_ids*. + + Artifacts published by these nodes are removed from kernel + indices and publisher indices. States for other nodes are + left untouched so their artifact metadata is preserved. 
+ """ + for nid in node_ids: + state = self._node_states.pop(nid, None) + if state is None: + continue + for ref in state.published: + # Remove from the kernel artifact index + kernel_map = self._kernel_artifacts.get(ref.kernel_id) + if kernel_map is not None: + # Only remove if this ref is still the current entry + existing = kernel_map.get(ref.name) + if existing is not None and existing.source_node_id == nid: + del kernel_map[ref.name] + # Remove from the reverse publisher index + key = (ref.kernel_id, ref.name) + pub_set = self._publisher_index.get(key) + if pub_set is not None: + pub_set.discard(nid) + if not pub_set: + del self._publisher_index[key] + + logger.debug( + "Cleared artifact metadata for node(s): %s", sorted(node_ids) + ) + def snapshot_node_states(self) -> dict[int, NodeArtifactState]: """Return a shallow copy of the current per-node states. diff --git a/flowfile_core/flowfile_core/flowfile/flow_graph.py b/flowfile_core/flowfile_core/flowfile/flow_graph.py index 5333aad53..9fa051754 100644 --- a/flowfile_core/flowfile_core/flowfile/flow_graph.py +++ b/flowfile_core/flowfile_core/flowfile/flow_graph.py @@ -2437,6 +2437,49 @@ def _get_required_kernel_ids(self) -> set[str]: kernel_ids.add(kid) return kernel_ids + def _compute_rerun_python_script_node_ids( + self, plan_skip_ids: set[str | int], + ) -> set[int]: + """Return node IDs for ``python_script`` nodes that will re-execute. + + A python_script node will re-execute (and thus needs its old + artifacts cleared) when: + + * It is NOT in the execution-plan skip set, **and** + * Its execution state indicates it has NOT already run with the + current setup (i.e. its cache is stale or it never ran). + """ + rerun: set[int] = set() + for node in self.nodes: + if node.node_type != "python_script": + continue + if node.node_id in plan_skip_ids: + continue + if not node._execution_state.has_run_with_current_setup: + rerun.add(node.node_id) + return rerun + + def _group_rerun_nodes_by_kernel( + self, rerun_node_ids: set[int], + ) -> dict[str, set[int]]: + """Group *rerun_node_ids* by their kernel ID. + + Returns a mapping ``kernel_id → {node_id, …}``. + """ + kernel_nodes: dict[str, set[int]] = {} + for node in self.nodes: + if node.node_id not in rerun_node_ids: + continue + if node.node_type == "python_script" and node.setting_input is not None: + kid = getattr( + getattr(node.setting_input, "python_script_input", None), + "kernel_id", + None, + ) + if kid: + kernel_nodes.setdefault(kid, set()).add(node.node_id) + return kernel_nodes + def _execute_single_node( self, node: FlowNode, @@ -2512,22 +2555,42 @@ def run_graph(self) -> RunInformation | None: self.flow_logger.clear_log_file() self.flow_logger.info("Starting to run flowfile flow...") - # Clear artifact tracking for a fresh run. - # Snapshot first so we can restore state for cached (skipped) nodes. - _prev_artifact_states = self.artifact_context.snapshot_node_states() - self.artifact_context.clear_all() - for kid in self._get_required_kernel_ids(): - self.artifact_context.clear_kernel(kid) - try: - from flowfile_core.kernel import get_kernel_manager - manager = get_kernel_manager() - manager.clear_artifacts_sync(kid) - except Exception: - logger.debug("Could not clear kernel artifacts for '%s'", kid) execution_plan = compute_execution_plan( nodes=self.nodes, flow_starts=self._flow_starts + self.get_implicit_starter_nodes() ) + # Selectively clear artifacts only for nodes that will re-run. 
+ # Nodes that are up-to-date keep their artifacts in both the + # metadata tracker AND the kernel's in-memory store so that + # downstream nodes can still read them. + plan_skip_ids: set[str | int] = {n.node_id for n in execution_plan.skip_nodes} + rerun_node_ids = self._compute_rerun_python_script_node_ids(plan_skip_ids) + + # Also purge stale metadata for nodes not in this graph + # (e.g. injected externally or left over from removed nodes). + graph_node_ids = set(self._node_db.keys()) + stale_node_ids = { + nid for nid in self.artifact_context._node_states + if nid not in graph_node_ids + } + nodes_to_clear = rerun_node_ids | stale_node_ids + if nodes_to_clear: + self.artifact_context.clear_nodes(nodes_to_clear) + + if rerun_node_ids: + # Clear the actual kernel-side artifacts for re-running nodes + kernel_node_map = self._group_rerun_nodes_by_kernel(rerun_node_ids) + for kid, node_ids_for_kernel in kernel_node_map.items(): + try: + from flowfile_core.kernel import get_kernel_manager + manager = get_kernel_manager() + manager.clear_node_artifacts_sync(kid, list(node_ids_for_kernel), flow_id=self.flow_id) + except Exception: + logger.debug( + "Could not clear node artifacts for kernel '%s', nodes %s", + kid, sorted(node_ids_for_kernel), + ) + self.latest_run_info = self.create_initial_run_information( execution_plan.node_count, "full_run" ) @@ -2537,7 +2600,7 @@ def run_graph(self) -> RunInformation | None: performance_mode = self.flow_settings.execution_mode == "Performance" run_info_lock = threading.Lock() - skip_node_ids: set[str | int] = {n.node_id for n in execution_plan.skip_nodes} + skip_node_ids: set[str | int] = plan_skip_ids for stage in execution_plan.stages: if self.flow_settings.is_canceled: @@ -2582,18 +2645,6 @@ def run_graph(self) -> RunInformation | None: for dep in node.get_all_dependent_nodes(): skip_node_ids.add(dep.node_id) - # Restore artifact state for graph nodes that were cached (skipped). - # Their _func didn't re-execute, so record_published was never - # called — replay their state from the pre-clear snapshot. - # Only restore nodes that actually belong to this graph to avoid - # resurrecting stale entries injected outside the graph. 
- graph_node_ids = set(self._node_db.keys()) - for nid, prev_state in _prev_artifact_states.items(): - if (nid in graph_node_ids - and nid not in self.artifact_context._node_states - and prev_state.published): - self.artifact_context.restore_node_state(nid, prev_state) - self.latest_run_info.end_time = datetime.datetime.now() self.flow_logger.info("Flow completed!") self.end_datetime = datetime.datetime.now() diff --git a/flowfile_core/flowfile_core/kernel/__init__.py b/flowfile_core/flowfile_core/kernel/__init__.py index bb730d1bb..fb3907b1b 100644 --- a/flowfile_core/flowfile_core/kernel/__init__.py +++ b/flowfile_core/flowfile_core/kernel/__init__.py @@ -1,5 +1,7 @@ from flowfile_core.kernel.manager import KernelManager from flowfile_core.kernel.models import ( + ClearNodeArtifactsRequest, + ClearNodeArtifactsResult, DockerStatus, ExecuteRequest, ExecuteResult, @@ -11,6 +13,8 @@ __all__ = [ "KernelManager", + "ClearNodeArtifactsRequest", + "ClearNodeArtifactsResult", "DockerStatus", "KernelConfig", "KernelInfo", diff --git a/flowfile_core/flowfile_core/kernel/manager.py b/flowfile_core/flowfile_core/kernel/manager.py index d3fecd4c4..9999e549d 100644 --- a/flowfile_core/flowfile_core/kernel/manager.py +++ b/flowfile_core/flowfile_core/kernel/manager.py @@ -6,6 +6,7 @@ import httpx from flowfile_core.kernel.models import ( + ClearNodeArtifactsResult, ExecuteRequest, ExecuteResult, KernelConfig, @@ -319,6 +320,52 @@ def clear_artifacts_sync(self, kernel_id: str) -> None: response = client.post(url) response.raise_for_status() + async def clear_node_artifacts( + self, kernel_id: str, node_ids: list[int], flow_id: int | None = None, + ) -> ClearNodeArtifactsResult: + """Clear only artifacts published by the given node IDs.""" + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state not in (KernelState.IDLE, KernelState.EXECUTING): + raise RuntimeError(f"Kernel '{kernel_id}' is not running (state: {kernel.state})") + + url = f"http://localhost:{kernel.port}/clear_node_artifacts" + payload: dict = {"node_ids": node_ids} + if flow_id is not None: + payload["flow_id"] = flow_id + async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client: + response = await client.post(url, json=payload) + response.raise_for_status() + return ClearNodeArtifactsResult(**response.json()) + + def clear_node_artifacts_sync( + self, kernel_id: str, node_ids: list[int], flow_id: int | None = None, + ) -> ClearNodeArtifactsResult: + """Synchronous wrapper for clearing artifacts by node IDs.""" + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state not in (KernelState.IDLE, KernelState.EXECUTING): + raise RuntimeError(f"Kernel '{kernel_id}' is not running (state: {kernel.state})") + + url = f"http://localhost:{kernel.port}/clear_node_artifacts" + payload: dict = {"node_ids": node_ids} + if flow_id is not None: + payload["flow_id"] = flow_id + with httpx.Client(timeout=httpx.Timeout(30.0)) as client: + response = client.post(url, json=payload) + response.raise_for_status() + return ClearNodeArtifactsResult(**response.json()) + + async def get_node_artifacts(self, kernel_id: str, node_id: int) -> dict: + """Get artifacts published by a specific node.""" + kernel = self._get_kernel_or_raise(kernel_id) + if kernel.state not in (KernelState.IDLE, KernelState.EXECUTING): + raise RuntimeError(f"Kernel '{kernel_id}' is not running (state: {kernel.state})") + + url = f"http://localhost:{kernel.port}/artifacts/node/{node_id}" + async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client: + 
response = await client.get(url) + response.raise_for_status() + return response.json() + # ------------------------------------------------------------------ # Queries # ------------------------------------------------------------------ diff --git a/flowfile_core/flowfile_core/kernel/models.py b/flowfile_core/flowfile_core/kernel/models.py index 7d5158c30..8d539cf39 100644 --- a/flowfile_core/flowfile_core/kernel/models.py +++ b/flowfile_core/flowfile_core/kernel/models.py @@ -53,6 +53,18 @@ class ExecuteRequest(BaseModel): log_callback_url: str = "" +class ClearNodeArtifactsRequest(BaseModel): + """Request to selectively clear artifacts owned by specific node IDs.""" + node_ids: list[int] + flow_id: int | None = None + + +class ClearNodeArtifactsResult(BaseModel): + """Result of a selective artifact clear operation.""" + status: str = "cleared" + removed: list[str] = Field(default_factory=list) + + class ExecuteResult(BaseModel): success: bool output_paths: list[str] = Field(default_factory=list) diff --git a/flowfile_core/flowfile_core/kernel/routes.py b/flowfile_core/flowfile_core/kernel/routes.py index 8aef90c32..6b2679dbe 100644 --- a/flowfile_core/flowfile_core/kernel/routes.py +++ b/flowfile_core/flowfile_core/kernel/routes.py @@ -3,7 +3,15 @@ from fastapi import APIRouter, Depends, HTTPException from flowfile_core.auth.jwt import get_current_active_user -from flowfile_core.kernel.models import DockerStatus, ExecuteRequest, ExecuteResult, KernelConfig, KernelInfo +from flowfile_core.kernel.models import ( + ClearNodeArtifactsRequest, + ClearNodeArtifactsResult, + DockerStatus, + ExecuteRequest, + ExecuteResult, + KernelConfig, + KernelInfo, +) logger = logging.getLogger(__name__) @@ -166,3 +174,43 @@ async def clear_artifacts(kernel_id: str, current_user=Depends(get_current_activ return {"status": "cleared", "kernel_id": kernel_id} except RuntimeError as exc: raise HTTPException(status_code=400, detail=str(exc)) + + +@router.post("/{kernel_id}/clear_node_artifacts", response_model=ClearNodeArtifactsResult) +async def clear_node_artifacts( + kernel_id: str, + request: ClearNodeArtifactsRequest, + current_user=Depends(get_current_active_user), +): + """Clear only artifacts published by specific node IDs.""" + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") + try: + return await manager.clear_node_artifacts(kernel_id, request.node_ids, flow_id=request.flow_id) + except RuntimeError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + + +@router.get("/{kernel_id}/artifacts/node/{node_id}") +async def get_node_artifacts( + kernel_id: str, + node_id: int, + current_user=Depends(get_current_active_user), +): + """Get artifacts published by a specific node.""" + manager = _get_manager() + kernel = await manager.get_kernel(kernel_id) + if kernel is None: + raise HTTPException(status_code=404, detail=f"Kernel '{kernel_id}' not found") + if manager.get_kernel_owner(kernel_id) != current_user.id: + raise HTTPException(status_code=403, detail="Not authorized to access this kernel") + if kernel.state.value not in ("idle", "executing"): + raise HTTPException(status_code=400, detail=f"Kernel '{kernel_id}' is not running") + try: + return await manager.get_node_artifacts(kernel_id, node_id) + except Exception as exc: 
+ raise HTTPException(status_code=500, detail=str(exc)) diff --git a/flowfile_core/tests/flowfile/test_artifact_context.py b/flowfile_core/tests/flowfile/test_artifact_context.py index 5fac95bd5..c40e8b808 100644 --- a/flowfile_core/tests/flowfile/test_artifact_context.py +++ b/flowfile_core/tests/flowfile/test_artifact_context.py @@ -276,6 +276,79 @@ def test_clear_all_removes_everything(self): assert ctx.get_all_artifacts() == {} +# --------------------------------------------------------------------------- +# ArtifactContext — Selective node clearing +# --------------------------------------------------------------------------- + + +class TestArtifactContextClearNodes: + def test_clear_nodes_removes_only_target(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k1", ["encoder"]) + ctx.clear_nodes({1}) + assert ctx.get_published_by_node(1) == [] + assert len(ctx.get_published_by_node(2)) == 1 + assert ctx.get_kernel_artifacts("k1") == {"encoder": ctx.get_published_by_node(2)[0]} + + def test_clear_nodes_preserves_other_node_metadata(self): + """Clearing node 2 should leave node 1's artifacts intact.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k1", ["scaler"]) + ctx.clear_nodes({2}) + published_1 = ctx.get_published_by_node(1) + assert len(published_1) == 1 + assert published_1[0].name == "model" + ka = ctx.get_kernel_artifacts("k1") + assert "model" in ka + assert "scaler" not in ka + + def test_clear_nodes_empty_set(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.clear_nodes(set()) + assert len(ctx.get_published_by_node(1)) == 1 + + def test_clear_nodes_nonexistent(self): + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.clear_nodes({99}) # Should not raise + assert len(ctx.get_published_by_node(1)) == 1 + + def test_clear_nodes_allows_re_record(self): + """After clearing, the node can re-record new artifacts.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.clear_nodes({1}) + ctx.record_published(1, "k1", ["model_v2"]) + published = ctx.get_published_by_node(1) + assert len(published) == 1 + assert published[0].name == "model_v2" + + def test_clear_nodes_updates_publisher_index(self): + """Publisher index should be cleaned up when a node is cleared.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.clear_nodes({1}) + # After clearing, the artifact should not show up as available + avail = ctx.compute_available(node_id=2, kernel_id="k1", upstream_node_ids=[1]) + assert avail == {} + + def test_clear_nodes_preserves_upstream_for_downstream(self): + """Simulates debug mode: node 1 is skipped (not cleared), + node 2 is re-running (cleared). 
Node 3 should still see node 1's artifact.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k1", ["predictions"]) + # Clear only node 2 (it will re-run) + ctx.clear_nodes({2}) + # Node 3 should still see "model" from node 1 + avail = ctx.compute_available(node_id=3, kernel_id="k1", upstream_node_ids=[1, 2]) + assert "model" in avail + assert "predictions" not in avail + + # --------------------------------------------------------------------------- # ArtifactContext — Queries # --------------------------------------------------------------------------- diff --git a/flowfile_core/tests/flowfile/test_kernel_integration.py b/flowfile_core/tests/flowfile/test_kernel_integration.py index 1b6130ede..b4b7d307d 100644 --- a/flowfile_core/tests/flowfile/test_kernel_integration.py +++ b/flowfile_core/tests/flowfile/test_kernel_integration.py @@ -1145,3 +1145,413 @@ def test_multi_input_read_inputs_named(self, kernel_manager: tuple[KernelManager finally: _kernel_mod._manager = _prev + + +# --------------------------------------------------------------------------- +# Tests — debug mode artifact persistence +# --------------------------------------------------------------------------- + + +class TestDebugModeArtifactPersistence: + """Integration tests verifying that artifacts survive re-runs in debug + (Development) mode when the producing node is skipped (up-to-date) but + a downstream consumer node needs to re-execute. + + This reproduces the exact scenario from the bug report: + 1. First run: Node 2 publishes 'linear_model', Node 3 reads it — OK. + 2. User changes Node 3's code. + 3. Second run: Node 2 is up-to-date → skipped, Node 3 re-runs → + must still be able to read 'linear_model' from kernel memory. + """ + + def test_artifact_survives_when_producer_skipped( + self, kernel_manager: tuple[KernelManager, str], + ): + """Core scenario: producer skipped, consumer re-runs, artifact accessible.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: input data + data = [ + {"x1": 1.0, "x2": 2.0, "y": 5.0}, + {"x1": 2.0, "x2": 3.0, "y": 8.0}, + {"x1": 3.0, "x2": 4.0, "y": 11.0}, + {"x1": 4.0, "x2": 5.0, "y": 14.0}, + ] + node_promise_1 = input_schema.NodePromise( + flow_id=1, node_id=1, node_type="manual_input", + ) + graph.add_node_promise(node_promise_1) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: train model and publish as artifact + node_promise_2 = input_schema.NodePromise( + flow_id=1, node_id=2, node_type="python_script", + ) + graph.add_node_promise(node_promise_2) + train_code = """ +import numpy as np +import polars as pl + +df = flowfile.read_input().collect() +X = np.column_stack([df["x1"].to_numpy(), df["x2"].to_numpy(), np.ones(len(df))]) +y_vals = df["y"].to_numpy() +coeffs = np.linalg.lstsq(X, y_vals, rcond=None)[0] +flowfile.publish_artifact("linear_model", {"coefficients": coeffs.tolist()}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=train_code, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(1, 2), + ) + + # Node 3: read model artifact and produce 
predictions + node_promise_3 = input_schema.NodePromise( + flow_id=1, node_id=3, node_type="python_script", + ) + graph.add_node_promise(node_promise_3) + apply_code_v1 = """ +import numpy as np +import polars as pl + +df = flowfile.read_input().collect() +model = flowfile.read_artifact("linear_model") +coeffs = np.array(model["coefficients"]) +X = np.column_stack([df["x1"].to_numpy(), df["x2"].to_numpy(), np.ones(len(df))]) +predictions = X @ coeffs +result = df.with_columns(pl.Series("predicted_y", predictions)) +flowfile.publish_output(result) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=apply_code_v1, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(2, 3), + ) + + # ---- First run: everything executes ---- + run_info_1 = graph.run_graph() + _handle_run_info(run_info_1) + + # Verify artifact was published and predictions were produced + published = graph.artifact_context.get_published_by_node(2) + assert any(r.name == "linear_model" for r in published) + node_3_df = graph.get_node(3).get_resulting_data().data_frame.collect() + assert "predicted_y" in node_3_df.columns + + # ---- Change Node 3's code (simulates user editing the consumer) ---- + # The new code still reads the same artifact but adds an extra column. + apply_code_v2 = """ +import numpy as np +import polars as pl + +df = flowfile.read_input().collect() +model = flowfile.read_artifact("linear_model") +coeffs = np.array(model["coefficients"]) +X = np.column_stack([df["x1"].to_numpy(), df["x2"].to_numpy(), np.ones(len(df))]) +predictions = X @ coeffs +residuals = df["y"].to_numpy() - predictions +result = df.with_columns( + pl.Series("predicted_y", predictions), + pl.Series("residual", residuals), +) +flowfile.publish_output(result) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=apply_code_v2, kernel_id=kernel_id, + ), + ) + ) + + # Verify the execution state before second run: + # Node 2 (producer) should still be up-to-date + node_2 = graph.get_node(2) + assert node_2._execution_state.has_run_with_current_setup, ( + "Producer node should be up-to-date (will be skipped)" + ) + # Node 3 (consumer) should need re-execution + node_3 = graph.get_node(3) + assert not node_3._execution_state.has_run_with_current_setup, ( + "Consumer node should be invalidated (will re-run)" + ) + + # ---- Second run: Node 2 is skipped, Node 3 re-runs ---- + # This is the critical test: Node 3 must still be able to + # read "linear_model" from kernel memory even though Node 2 + # did not re-execute. 
+ run_info_2 = graph.run_graph() + _handle_run_info(run_info_2) + + # Verify the producer's artifact metadata is still tracked + published_after = graph.artifact_context.get_published_by_node(2) + assert any(r.name == "linear_model" for r in published_after), ( + "Producer's artifact metadata should be preserved when skipped" + ) + + # Verify the consumer ran with the new code (has residual column) + node_3_df_v2 = graph.get_node(3).get_resulting_data().data_frame.collect() + assert "predicted_y" in node_3_df_v2.columns + assert "residual" in node_3_df_v2.columns, ( + "Consumer should have run with updated code" + ) + # Residuals should be near-zero for this perfect linear fit + for r in node_3_df_v2["residual"].to_list(): + assert abs(r) < 0.01 + + finally: + _kernel_mod._manager = _prev + + def test_multiple_artifacts_survive_selective_clear( + self, kernel_manager: tuple[KernelManager, str], + ): + """Multiple artifacts from a skipped producer survive when only + the consumer is re-run.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: input data + data = [{"val": 10}, {"val": 20}, {"val": 30}] + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input"), + ) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: publish two artifacts (model + scaler) + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script"), + ) + producer_code = """ +df = flowfile.read_input() +flowfile.publish_artifact("model", {"type": "linear", "coeff": 2.0}) +flowfile.publish_artifact("scaler", {"mean": 20.0, "std": 10.0}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=producer_code, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(1, 2), + ) + + # Node 3: read both artifacts + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script"), + ) + consumer_code_v1 = """ +import polars as pl +df = flowfile.read_input().collect() +model = flowfile.read_artifact("model") +scaler = flowfile.read_artifact("scaler") +result = df.with_columns( + (pl.col("val") * model["coeff"]).alias("scaled"), +) +flowfile.publish_output(result) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=consumer_code_v1, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(2, 3), + ) + + # First run + _handle_run_info(graph.run_graph()) + + # Change the consumer's code — also use the scaler now + consumer_code_v2 = """ +import polars as pl +df = flowfile.read_input().collect() +model = flowfile.read_artifact("model") +scaler = flowfile.read_artifact("scaler") +normalized = (pl.col("val") - scaler["mean"]) / scaler["std"] +result = df.with_columns( + (pl.col("val") * model["coeff"]).alias("scaled"), + normalized.alias("normalized"), +) +flowfile.publish_output(result) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], 
+ python_script_input=input_schema.PythonScriptInput( + code=consumer_code_v2, kernel_id=kernel_id, + ), + ) + ) + + # Second run — producer skipped, consumer re-runs + _handle_run_info(graph.run_graph()) + + # Both artifacts should still be accessible + published = graph.artifact_context.get_published_by_node(2) + names = {r.name for r in published} + assert "model" in names + assert "scaler" in names + + # Consumer should have the new column + df_out = graph.get_node(3).get_resulting_data().data_frame.collect() + assert "scaled" in df_out.columns + assert "normalized" in df_out.columns + + finally: + _kernel_mod._manager = _prev + + def test_rerun_producer_clears_old_artifacts( + self, kernel_manager: tuple[KernelManager, str], + ): + """When the producer itself is changed and re-runs, its old + artifacts are properly cleared before re-execution.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev = _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: input + data = [{"val": 1}] + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input"), + ) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: publish artifact v1 + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script"), + ) + code_v1 = """ +df = flowfile.read_input() +flowfile.publish_artifact("model", {"version": 1}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code_v1, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(1, 2), + ) + + # Node 3: read artifact + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script"), + ) + consumer_code = """ +df = flowfile.read_input() +model = flowfile.read_artifact("model") +print(f"model version: {model['version']}") +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=consumer_code, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(2, 3), + ) + + # First run + _handle_run_info(graph.run_graph()) + + published = graph.artifact_context.get_published_by_node(2) + assert any(r.name == "model" for r in published) + + # Change the PRODUCER (Node 2) — publish v2 of the artifact + code_v2 = """ +df = flowfile.read_input() +flowfile.publish_artifact("model", {"version": 2}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=code_v2, kernel_id=kernel_id, + ), + ) + ) + + # Both Node 2 and Node 3 should need re-execution + # (Node 3 because its upstream changed via evaluate_nodes) + assert not graph.get_node(2)._execution_state.has_run_with_current_setup + assert not graph.get_node(3)._execution_state.has_run_with_current_setup + + # Second run — both re-execute; old "model" must be cleared + # before Node 2 re-publishes, otherwise publish would fail + # with "already exists". 
+ _handle_run_info(graph.run_graph()) + + # Artifact should be the new version + published_v2 = graph.artifact_context.get_published_by_node(2) + assert any(r.name == "model" for r in published_v2) + + finally: + _kernel_mod._manager = _prev diff --git a/kernel_runtime/kernel_runtime/artifact_store.py b/kernel_runtime/kernel_runtime/artifact_store.py index a18ed81db..f9382493d 100644 --- a/kernel_runtime/kernel_runtime/artifact_store.py +++ b/kernel_runtime/kernel_runtime/artifact_store.py @@ -5,50 +5,101 @@ class ArtifactStore: - """Thread-safe in-memory store for Python artifacts produced during kernel execution.""" + """Thread-safe in-memory store for Python artifacts produced during kernel execution. + + Artifacts are scoped by ``flow_id`` so that multiple flows sharing the + same kernel container cannot collide on artifact names. + """ def __init__(self): self._lock = threading.Lock() - self._artifacts: dict[str, dict[str, Any]] = {} + # Keyed by (flow_id, name) so each flow has its own namespace. + self._artifacts: dict[tuple[int, str], dict[str, Any]] = {} - def publish(self, name: str, obj: Any, node_id: int) -> None: + def publish(self, name: str, obj: Any, node_id: int, flow_id: int = 0) -> None: + key = (flow_id, name) with self._lock: - if name in self._artifacts: + if key in self._artifacts: raise ValueError( f"Artifact '{name}' already exists (published by node " - f"{self._artifacts[name]['node_id']}). " + f"{self._artifacts[key]['node_id']}). " f"Delete it first with flowfile.delete_artifact('{name}') " f"before publishing a new one with the same name." ) - self._artifacts[name] = { + self._artifacts[key] = { "object": obj, "name": name, "type_name": type(obj).__name__, "module": type(obj).__module__, "node_id": node_id, + "flow_id": flow_id, "created_at": datetime.now(timezone.utc).isoformat(), "size_bytes": sys.getsizeof(obj), } - def delete(self, name: str) -> None: + def delete(self, name: str, flow_id: int = 0) -> None: + key = (flow_id, name) with self._lock: - if name not in self._artifacts: + if key not in self._artifacts: raise KeyError(f"Artifact '{name}' not found") - del self._artifacts[name] + del self._artifacts[key] - def get(self, name: str) -> Any: + def get(self, name: str, flow_id: int = 0) -> Any: + key = (flow_id, name) with self._lock: - if name not in self._artifacts: + if key not in self._artifacts: raise KeyError(f"Artifact '{name}' not found") - return self._artifacts[name]["object"] + return self._artifacts[key]["object"] - def list_all(self) -> dict[str, dict[str, Any]]: + def list_all(self, flow_id: int | None = None) -> dict[str, dict[str, Any]]: + """Return metadata for all artifacts, optionally filtered by *flow_id*.""" with self._lock: return { - name: {k: v for k, v in meta.items() if k != "object"} - for name, meta in self._artifacts.items() + meta["name"]: {k: v for k, v in meta.items() if k != "object"} + for (_fid, _name), meta in self._artifacts.items() + if flow_id is None or _fid == flow_id } - def clear(self) -> None: + def clear(self, flow_id: int | None = None) -> None: + """Clear all artifacts, or only those belonging to *flow_id*.""" + with self._lock: + if flow_id is None: + self._artifacts.clear() + else: + to_remove = [ + key for key in self._artifacts if key[0] == flow_id + ] + for key in to_remove: + del self._artifacts[key] + + def clear_by_node_ids( + self, node_ids: set[int], flow_id: int | None = None, + ) -> list[str]: + """Remove all artifacts published by the given *node_ids*. 
+ + When *flow_id* is provided, only artifacts in that flow are + considered. Returns the names of deleted artifacts. + """ with self._lock: - self._artifacts.clear() + to_remove = [ + key + for key, meta in self._artifacts.items() + if meta["node_id"] in node_ids + and (flow_id is None or key[0] == flow_id) + ] + removed_names = [self._artifacts[key]["name"] for key in to_remove] + for key in to_remove: + del self._artifacts[key] + return removed_names + + def list_by_node_id( + self, node_id: int, flow_id: int | None = None, + ) -> dict[str, dict[str, Any]]: + """Return metadata for artifacts published by *node_id*.""" + with self._lock: + return { + meta["name"]: {k: v for k, v in meta.items() if k != "object"} + for (_fid, _name), meta in self._artifacts.items() + if meta["node_id"] == node_id + and (flow_id is None or _fid == flow_id) + } diff --git a/kernel_runtime/kernel_runtime/flowfile_client.py b/kernel_runtime/kernel_runtime/flowfile_client.py index 1c5e5bf0d..b68acb87f 100644 --- a/kernel_runtime/kernel_runtime/flowfile_client.py +++ b/kernel_runtime/kernel_runtime/flowfile_client.py @@ -116,22 +116,26 @@ def publish_output(df: pl.LazyFrame | pl.DataFrame, name: str = "main") -> None: def publish_artifact(name: str, obj: Any) -> None: store: ArtifactStore = _get_context_value("artifact_store") node_id: int = _get_context_value("node_id") - store.publish(name, obj, node_id) + flow_id: int = _get_context_value("flow_id") + store.publish(name, obj, node_id, flow_id=flow_id) def read_artifact(name: str) -> Any: store: ArtifactStore = _get_context_value("artifact_store") - return store.get(name) + flow_id: int = _get_context_value("flow_id") + return store.get(name, flow_id=flow_id) def delete_artifact(name: str) -> None: store: ArtifactStore = _get_context_value("artifact_store") - store.delete(name) + flow_id: int = _get_context_value("flow_id") + store.delete(name, flow_id=flow_id) def list_artifacts() -> dict: store: ArtifactStore = _get_context_value("artifact_store") - return store.list_all() + flow_id: int = _get_context_value("flow_id") + return store.list_all(flow_id=flow_id) # ===== Logging APIs ===== diff --git a/kernel_runtime/kernel_runtime/main.py b/kernel_runtime/kernel_runtime/main.py index 918a989a7..7ea76bbca 100644 --- a/kernel_runtime/kernel_runtime/main.py +++ b/kernel_runtime/kernel_runtime/main.py @@ -4,7 +4,7 @@ import time from pathlib import Path -from fastapi import FastAPI +from fastapi import FastAPI, Query from pydantic import BaseModel from kernel_runtime import __version__, flowfile_client @@ -23,6 +23,11 @@ class ExecuteRequest(BaseModel): log_callback_url: str = "" +class ClearNodeArtifactsRequest(BaseModel): + node_ids: list[int] + flow_id: int | None = None + + class ExecuteResponse(BaseModel): success: bool output_paths: list[str] = [] @@ -44,7 +49,11 @@ async def execute(request: ExecuteRequest): if output_dir: os.makedirs(output_dir, exist_ok=True) - artifacts_before = set(artifact_store.list_all().keys()) + # Clear any artifacts this node previously published so re-execution + # doesn't fail with "already exists". 
+ artifact_store.clear_by_node_ids({request.node_id}, flow_id=request.flow_id) + + artifacts_before = set(artifact_store.list_all(flow_id=request.flow_id).keys()) try: flowfile_client._set_context( @@ -66,7 +75,7 @@ async def execute(request: ExecuteRequest): str(p) for p in sorted(Path(output_dir).glob("*.parquet")) ] - artifacts_after = set(artifact_store.list_all().keys()) + artifacts_after = set(artifact_store.list_all(flow_id=request.flow_id).keys()) new_artifacts = sorted(artifacts_after - artifacts_before) deleted_artifacts = sorted(artifacts_before - artifacts_after) @@ -94,14 +103,33 @@ async def execute(request: ExecuteRequest): @app.post("/clear") -async def clear_artifacts(): - artifact_store.clear() +async def clear_artifacts(flow_id: int | None = Query(default=None)): + """Clear all artifacts, or only those belonging to a specific flow.""" + artifact_store.clear(flow_id=flow_id) return {"status": "cleared"} +@app.post("/clear_node_artifacts") +async def clear_node_artifacts(request: ClearNodeArtifactsRequest): + """Clear only artifacts published by the specified node IDs.""" + removed = artifact_store.clear_by_node_ids( + set(request.node_ids), flow_id=request.flow_id, + ) + return {"status": "cleared", "removed": removed} + + @app.get("/artifacts") -async def list_artifacts(): - return artifact_store.list_all() +async def list_artifacts(flow_id: int | None = Query(default=None)): + """List all artifacts, optionally filtered by flow_id.""" + return artifact_store.list_all(flow_id=flow_id) + + +@app.get("/artifacts/node/{node_id}") +async def list_node_artifacts( + node_id: int, flow_id: int | None = Query(default=None), +): + """List artifacts published by a specific node.""" + return artifact_store.list_by_node_id(node_id, flow_id=flow_id) @app.get("/health") diff --git a/kernel_runtime/tests/test_artifact_store.py b/kernel_runtime/tests/test_artifact_store.py index 6c9564525..c138aa4b3 100644 --- a/kernel_runtime/tests/test_artifact_store.py +++ b/kernel_runtime/tests/test_artifact_store.py @@ -106,6 +106,131 @@ def test_delete_only_target(self, store: ArtifactStore): assert set(store.list_all().keys()) == {"keep"} +class TestClearByNodeIds: + def test_clear_by_node_ids_removes_only_target(self, store: ArtifactStore): + store.publish("a", 1, node_id=1) + store.publish("b", 2, node_id=2) + store.publish("c", 3, node_id=1) + removed = store.clear_by_node_ids({1}) + assert sorted(removed) == ["a", "c"] + assert "b" in store.list_all() + assert "a" not in store.list_all() + assert "c" not in store.list_all() + + def test_clear_by_node_ids_empty_set(self, store: ArtifactStore): + store.publish("x", 1, node_id=1) + removed = store.clear_by_node_ids(set()) + assert removed == [] + assert "x" in store.list_all() + + def test_clear_by_node_ids_nonexistent(self, store: ArtifactStore): + store.publish("x", 1, node_id=1) + removed = store.clear_by_node_ids({99}) + assert removed == [] + assert "x" in store.list_all() + + def test_clear_by_node_ids_multiple(self, store: ArtifactStore): + store.publish("a", 1, node_id=1) + store.publish("b", 2, node_id=2) + store.publish("c", 3, node_id=3) + removed = store.clear_by_node_ids({1, 3}) + assert sorted(removed) == ["a", "c"] + assert set(store.list_all().keys()) == {"b"} + + def test_clear_allows_republish(self, store: ArtifactStore): + """After clearing a node's artifacts, re-publishing with the same name works.""" + store.publish("model", {"v": 1}, node_id=5) + store.clear_by_node_ids({5}) + store.publish("model", {"v": 2}, node_id=5) + 
assert store.get("model") == {"v": 2} + + +class TestListByNodeId: + def test_list_by_node_id(self, store: ArtifactStore): + store.publish("a", 1, node_id=1) + store.publish("b", 2, node_id=2) + store.publish("c", 3, node_id=1) + listing = store.list_by_node_id(1) + assert set(listing.keys()) == {"a", "c"} + + def test_list_by_node_id_empty(self, store: ArtifactStore): + assert store.list_by_node_id(99) == {} + + def test_list_by_node_id_excludes_object(self, store: ArtifactStore): + store.publish("x", {"secret": "data"}, node_id=1) + listing = store.list_by_node_id(1) + assert "object" not in listing["x"] + + +class TestFlowIsolation: + """Artifacts with the same name in different flows are independent.""" + + def test_same_name_different_flows(self, store: ArtifactStore): + store.publish("model", "flow1_model", node_id=1, flow_id=1) + store.publish("model", "flow2_model", node_id=2, flow_id=2) + assert store.get("model", flow_id=1) == "flow1_model" + assert store.get("model", flow_id=2) == "flow2_model" + + def test_delete_scoped_to_flow(self, store: ArtifactStore): + store.publish("model", "v1", node_id=1, flow_id=1) + store.publish("model", "v2", node_id=2, flow_id=2) + store.delete("model", flow_id=1) + # flow 2's artifact is untouched + assert store.get("model", flow_id=2) == "v2" + with pytest.raises(KeyError): + store.get("model", flow_id=1) + + def test_list_all_filtered_by_flow(self, store: ArtifactStore): + store.publish("a", 1, node_id=1, flow_id=1) + store.publish("b", 2, node_id=2, flow_id=2) + store.publish("c", 3, node_id=1, flow_id=1) + assert set(store.list_all(flow_id=1).keys()) == {"a", "c"} + assert set(store.list_all(flow_id=2).keys()) == {"b"} + + def test_list_all_unfiltered_returns_everything(self, store: ArtifactStore): + store.publish("a", 1, node_id=1, flow_id=1) + store.publish("b", 2, node_id=2, flow_id=2) + assert set(store.list_all().keys()) == {"a", "b"} + + def test_clear_scoped_to_flow(self, store: ArtifactStore): + store.publish("a", 1, node_id=1, flow_id=1) + store.publish("b", 2, node_id=2, flow_id=2) + store.clear(flow_id=1) + with pytest.raises(KeyError): + store.get("a", flow_id=1) + assert store.get("b", flow_id=2) == 2 + + def test_clear_all_clears_every_flow(self, store: ArtifactStore): + store.publish("a", 1, node_id=1, flow_id=1) + store.publish("b", 2, node_id=2, flow_id=2) + store.clear() + assert store.list_all() == {} + + def test_clear_by_node_ids_scoped_to_flow(self, store: ArtifactStore): + """Same node_id in different flows — only the targeted flow is cleared.""" + store.publish("model", "f1", node_id=5, flow_id=1) + store.publish("model", "f2", node_id=5, flow_id=2) + removed = store.clear_by_node_ids({5}, flow_id=1) + assert removed == ["model"] + # flow 2's artifact survives + assert store.get("model", flow_id=2) == "f2" + with pytest.raises(KeyError): + store.get("model", flow_id=1) + + def test_list_by_node_id_scoped_to_flow(self, store: ArtifactStore): + store.publish("a", 1, node_id=5, flow_id=1) + store.publish("b", 2, node_id=5, flow_id=2) + assert set(store.list_by_node_id(5, flow_id=1).keys()) == {"a"} + assert set(store.list_by_node_id(5, flow_id=2).keys()) == {"b"} + # Unfiltered returns both + assert set(store.list_by_node_id(5).keys()) == {"a", "b"} + + def test_metadata_includes_flow_id(self, store: ArtifactStore): + store.publish("item", 42, node_id=1, flow_id=7) + meta = store.list_all(flow_id=7)["item"] + assert meta["flow_id"] == 7 + + class TestThreadSafety: def test_concurrent_publishes(self, store: ArtifactStore): 
errors = [] diff --git a/kernel_runtime/tests/test_main.py b/kernel_runtime/tests/test_main.py index dd0bf7d8f..7f429454f 100644 --- a/kernel_runtime/tests/test_main.py +++ b/kernel_runtime/tests/test_main.py @@ -404,6 +404,46 @@ def test_delete_artifact_via_execute(self, client: TestClient): resp_list = client.get("/artifacts") assert "temp" not in resp_list.json() + def test_same_node_reexecution_clears_own_artifacts(self, client: TestClient): + """Re-executing the same node auto-clears its previous artifacts.""" + resp1 = client.post( + "/execute", + json={ + "node_id": 24, + "code": 'flowfile.publish_artifact("model", "v1")', + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp1.json()["success"] is True + assert "model" in resp1.json()["artifacts_published"] + + # Same node re-executes — should NOT fail with "already exists" + resp2 = client.post( + "/execute", + json={ + "node_id": 24, + "code": 'flowfile.publish_artifact("model", "v2")', + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp2.json()["success"] is True + assert "model" in resp2.json()["artifacts_published"] + + # Verify we get v2 + resp3 = client.post( + "/execute", + json={ + "node_id": 99, + "code": 'v = flowfile.read_artifact("model"); print(v)', + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp3.json()["success"] is True + assert "v2" in resp3.json()["stdout"] + def test_delete_then_republish_via_execute(self, client: TestClient): """After deleting, a new artifact with the same name can be published.""" client.post( @@ -453,6 +493,120 @@ def test_delete_then_republish_via_execute(self, client: TestClient): assert "v2" in resp_read.json()["stdout"] +class TestClearNodeArtifactsEndpoint: + def test_clear_node_artifacts_selective(self, client: TestClient): + """Only artifacts from specified node IDs should be removed.""" + # Publish artifacts from two different nodes + client.post( + "/execute", + json={ + "node_id": 40, + "code": 'flowfile.publish_artifact("model", {"v": 1})', + "input_paths": {}, + "output_dir": "", + }, + ) + client.post( + "/execute", + json={ + "node_id": 41, + "code": 'flowfile.publish_artifact("scaler", {"v": 2})', + "input_paths": {}, + "output_dir": "", + }, + ) + + # Clear only node 40's artifacts + resp = client.post("/clear_node_artifacts", json={"node_ids": [40]}) + assert resp.status_code == 200 + data = resp.json() + assert data["status"] == "cleared" + assert "model" in data["removed"] + + # "scaler" from node 41 should still exist + artifacts = client.get("/artifacts").json() + assert "model" not in artifacts + assert "scaler" in artifacts + + def test_clear_node_artifacts_empty_list(self, client: TestClient): + """Passing empty list should not remove anything.""" + client.post( + "/execute", + json={ + "node_id": 42, + "code": 'flowfile.publish_artifact("keep_me", 42)', + "input_paths": {}, + "output_dir": "", + }, + ) + resp = client.post("/clear_node_artifacts", json={"node_ids": []}) + assert resp.status_code == 200 + assert resp.json()["removed"] == [] + assert "keep_me" in client.get("/artifacts").json() + + def test_clear_node_artifacts_allows_republish(self, client: TestClient): + """After clearing, the same artifact name can be re-published.""" + client.post( + "/execute", + json={ + "node_id": 43, + "code": 'flowfile.publish_artifact("reuse", "v1")', + "input_paths": {}, + "output_dir": "", + }, + ) + client.post("/clear_node_artifacts", json={"node_ids": [43]}) + resp = client.post( + "/execute", + json={ + "node_id": 43, + "code": 
'flowfile.publish_artifact("reuse", "v2")', + "input_paths": {}, + "output_dir": "", + }, + ) + assert resp.json()["success"] is True + + +class TestNodeArtifactsEndpoint: + def test_list_node_artifacts(self, client: TestClient): + """Should return only artifacts for the specified node.""" + client.post( + "/execute", + json={ + "node_id": 50, + "code": ( + 'flowfile.publish_artifact("a", 1)\n' + 'flowfile.publish_artifact("b", 2)\n' + ), + "input_paths": {}, + "output_dir": "", + }, + ) + client.post( + "/execute", + json={ + "node_id": 51, + "code": 'flowfile.publish_artifact("c", 3)', + "input_paths": {}, + "output_dir": "", + }, + ) + + resp = client.get("/artifacts/node/50") + assert resp.status_code == 200 + data = resp.json() + assert set(data.keys()) == {"a", "b"} + + resp2 = client.get("/artifacts/node/51") + assert set(resp2.json().keys()) == {"c"} + + def test_list_node_artifacts_empty(self, client: TestClient): + resp = client.get("/artifacts/node/999") + assert resp.status_code == 200 + assert resp.json() == {} + + class TestContextCleanup: def test_context_cleared_after_success(self, client: TestClient): """After a successful /execute, the flowfile context should be cleared.""" @@ -501,3 +655,206 @@ def test_context_cleared_after_error(self, client: TestClient): data = resp.json() assert data["success"] is True assert "still works" in data["stdout"] + + +class TestFlowIsolation: + """Artifacts published by different flows don't interfere with each other.""" + + def test_same_artifact_name_different_flows(self, client: TestClient): + """Two flows can each publish an artifact called 'model' independently.""" + resp1 = client.post( + "/execute", + json={ + "node_id": 1, + "code": 'flowfile.publish_artifact("model", "flow1_model")', + "input_paths": {}, + "output_dir": "", + "flow_id": 1, + }, + ) + assert resp1.json()["success"] is True + + resp2 = client.post( + "/execute", + json={ + "node_id": 1, + "code": 'flowfile.publish_artifact("model", "flow2_model")', + "input_paths": {}, + "output_dir": "", + "flow_id": 2, + }, + ) + assert resp2.json()["success"] is True + + # Each flow reads its own artifact + resp_read1 = client.post( + "/execute", + json={ + "node_id": 99, + "code": 'v = flowfile.read_artifact("model"); print(v)', + "input_paths": {}, + "output_dir": "", + "flow_id": 1, + }, + ) + assert resp_read1.json()["success"] is True + assert "flow1_model" in resp_read1.json()["stdout"] + + resp_read2 = client.post( + "/execute", + json={ + "node_id": 99, + "code": 'v = flowfile.read_artifact("model"); print(v)', + "input_paths": {}, + "output_dir": "", + "flow_id": 2, + }, + ) + assert resp_read2.json()["success"] is True + assert "flow2_model" in resp_read2.json()["stdout"] + + def test_flow_cannot_read_other_flows_artifact(self, client: TestClient): + """Flow 1 publishes 'secret'; flow 2 should not see it.""" + client.post( + "/execute", + json={ + "node_id": 1, + "code": 'flowfile.publish_artifact("secret", "hidden")', + "input_paths": {}, + "output_dir": "", + "flow_id": 1, + }, + ) + + resp = client.post( + "/execute", + json={ + "node_id": 2, + "code": 'flowfile.read_artifact("secret")', + "input_paths": {}, + "output_dir": "", + "flow_id": 2, + }, + ) + data = resp.json() + assert data["success"] is False + assert "not found" in data["error"] + + def test_reexecution_only_clears_own_flow(self, client: TestClient): + """Re-executing a node in flow 1 doesn't clear flow 2's artifacts.""" + # Flow 1, node 5 publishes "model" + client.post( + "/execute", + json={ + 
"node_id": 5, + "code": 'flowfile.publish_artifact("model", "f1v1")', + "input_paths": {}, + "output_dir": "", + "flow_id": 1, + }, + ) + # Flow 2, node 5 publishes "model" + client.post( + "/execute", + json={ + "node_id": 5, + "code": 'flowfile.publish_artifact("model", "f2v1")', + "input_paths": {}, + "output_dir": "", + "flow_id": 2, + }, + ) + + # Re-execute node 5 in flow 1 — auto-clear only affects flow 1 + resp = client.post( + "/execute", + json={ + "node_id": 5, + "code": 'flowfile.publish_artifact("model", "f1v2")', + "input_paths": {}, + "output_dir": "", + "flow_id": 1, + }, + ) + assert resp.json()["success"] is True + + # Flow 2's artifact should be untouched + resp_f2 = client.post( + "/execute", + json={ + "node_id": 99, + "code": 'v = flowfile.read_artifact("model"); print(v)', + "input_paths": {}, + "output_dir": "", + "flow_id": 2, + }, + ) + assert resp_f2.json()["success"] is True + assert "f2v1" in resp_f2.json()["stdout"] + + def test_list_artifacts_filtered_by_flow(self, client: TestClient): + """GET /artifacts?flow_id=X returns only that flow's artifacts.""" + client.post( + "/execute", + json={ + "node_id": 1, + "code": 'flowfile.publish_artifact("a", 1)', + "input_paths": {}, + "output_dir": "", + "flow_id": 10, + }, + ) + client.post( + "/execute", + json={ + "node_id": 2, + "code": 'flowfile.publish_artifact("b", 2)', + "input_paths": {}, + "output_dir": "", + "flow_id": 20, + }, + ) + + resp10 = client.get("/artifacts", params={"flow_id": 10}) + assert set(resp10.json().keys()) == {"a"} + + resp20 = client.get("/artifacts", params={"flow_id": 20}) + assert set(resp20.json().keys()) == {"b"} + + # No filter returns both + resp_all = client.get("/artifacts") + assert set(resp_all.json().keys()) == {"a", "b"} + + def test_clear_node_artifacts_scoped_to_flow(self, client: TestClient): + """POST /clear_node_artifacts with flow_id only clears that flow.""" + client.post( + "/execute", + json={ + "node_id": 5, + "code": 'flowfile.publish_artifact("model", "f1")', + "input_paths": {}, + "output_dir": "", + "flow_id": 1, + }, + ) + client.post( + "/execute", + json={ + "node_id": 5, + "code": 'flowfile.publish_artifact("model", "f2")', + "input_paths": {}, + "output_dir": "", + "flow_id": 2, + }, + ) + + resp = client.post( + "/clear_node_artifacts", + json={"node_ids": [5], "flow_id": 1}, + ) + assert resp.json()["status"] == "cleared" + assert "model" in resp.json()["removed"] + + # Flow 2's artifact survives + artifacts_f2 = client.get("/artifacts", params={"flow_id": 2}).json() + assert "model" in artifacts_f2 From 69e0a3a1ae4dfe4d44d3550663002a169137720d Mon Sep 17 00:00:00 2001 From: edwardvanechoud Date: Mon, 2 Feb 2026 18:17:02 +0100 Subject: [PATCH 08/38] fixing issue in index.ts --- flowfile_frontend/src/renderer/app/router/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/flowfile_frontend/src/renderer/app/router/index.ts b/flowfile_frontend/src/renderer/app/router/index.ts index 224267c22..b93fb2353 100644 --- a/flowfile_frontend/src/renderer/app/router/index.ts +++ b/flowfile_frontend/src/renderer/app/router/index.ts @@ -66,6 +66,7 @@ const routes: Array = [ path: "kernelManager", component: () => import("../views/KernelManagerView/KernelManagerView.vue"), }, + { name: "catalog", path: "catalog", component: () => import("../views/CatalogView/CatalogView.vue"), From f358b666433a0a19a2a1a5a499729f0df22458bb Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Tue, 3 Feb 2026 07:29:32 +0100 
Subject: [PATCH 09/38] Fix artifact not found on re-run when consumer deletes artifact (#294) When a python_script node deletes an artifact (via delete_artifact) and is later re-executed (e.g. after a code change), the upstream producer node was not being re-run. This meant the deleted artifact was permanently lost from the kernel's in-memory store, causing a KeyError on the consumer's read_artifact call. The fix tracks which node originally published each deleted artifact (_deletion_origins in ArtifactContext). During the pre-execution phase in run_graph, if a re-running node previously deleted artifacts, the original producer nodes are added to the re-run set and their execution state is marked stale so they actually re-execute and republish. --- .../flowfile_core/flowfile/artifacts.py | 39 ++++++ .../flowfile_core/flowfile/flow_graph.py | 19 +++ .../tests/flowfile/test_artifact_context.py | 61 ++++++++++ .../tests/flowfile/test_kernel_integration.py | 115 ++++++++++++++++++ 4 files changed, 234 insertions(+) diff --git a/flowfile_core/flowfile_core/flowfile/artifacts.py b/flowfile_core/flowfile_core/flowfile/artifacts.py index c877cfd37..8b6d33337 100644 --- a/flowfile_core/flowfile_core/flowfile/artifacts.py +++ b/flowfile_core/flowfile_core/flowfile/artifacts.py @@ -72,6 +72,11 @@ def __init__(self) -> None: # Reverse index: (kernel_id, artifact_name) → set of node_ids that # published it. Avoids O(N) scan in record_deleted / clear_kernel. self._publisher_index: dict[tuple[str, str], set[int]] = {} + # Tracks which nodes produced the artifacts that were deleted by each + # node. Used during re-execution to force producers to re-run when + # a consumer that deleted their artifacts needs to re-execute. + # Maps: deleter_node_id → [(kernel_id, artifact_name, publisher_node_id), …] + self._deletion_origins: dict[int, list[tuple[str, str, int]]] = {} # ------------------------------------------------------------------ # Recording @@ -150,6 +155,14 @@ def record_deleted( # Use the reverse index to update only the affected nodes key = (kernel_id, name) publisher_ids = self._publisher_index.pop(key, set()) + + # Remember which nodes produced these artifacts so we can + # force them to re-run if this deleter node is re-executed. + for pid in publisher_ids: + self._deletion_origins.setdefault(node_id, []).append( + (kernel_id, name, pid) + ) + for pid in publisher_ids: ns = self._node_states.get(pid) if ns is not None: @@ -233,6 +246,21 @@ def get_all_artifacts(self) -> dict[str, ArtifactRef]: result.update(kernel_map) return result + def get_producer_nodes_for_deletions( + self, deleter_node_ids: set[int], + ) -> set[int]: + """Return node IDs that produced artifacts deleted by *deleter_node_ids*. + + When a consumer node that previously deleted artifacts needs to + re-execute, the original producer nodes must also re-run so the + artifacts are available again in the kernel's in-memory store. 
+ """ + producers: set[int] = set() + for nid in deleter_node_ids: + for _kernel_id, _name, pub_id in self._deletion_origins.get(nid, []): + producers.add(pub_id) + return producers + # ------------------------------------------------------------------ # Clearing # ------------------------------------------------------------------ @@ -248,6 +276,15 @@ def clear_kernel(self, kernel_id: str) -> None: for k in keys_to_remove: del self._publisher_index[k] + # Clean deletion origin entries for this kernel + for nid in list(self._deletion_origins): + self._deletion_origins[nid] = [ + entry for entry in self._deletion_origins[nid] + if entry[0] != kernel_id + ] + if not self._deletion_origins[nid]: + del self._deletion_origins[nid] + self._kernel_artifacts.pop(kernel_id, None) for state in self._node_states.values(): state.published = [r for r in state.published if r.kernel_id != kernel_id] @@ -260,6 +297,7 @@ def clear_all(self) -> None: self._node_states.clear() self._kernel_artifacts.clear() self._publisher_index.clear() + self._deletion_origins.clear() def clear_nodes(self, node_ids: set[int]) -> None: """Remove tracking data only for the specified *node_ids*. @@ -269,6 +307,7 @@ def clear_nodes(self, node_ids: set[int]) -> None: left untouched so their artifact metadata is preserved. """ for nid in node_ids: + self._deletion_origins.pop(nid, None) state = self._node_states.pop(nid, None) if state is None: continue diff --git a/flowfile_core/flowfile_core/flowfile/flow_graph.py b/flowfile_core/flowfile_core/flowfile/flow_graph.py index 9fa051754..784fd9c7e 100644 --- a/flowfile_core/flowfile_core/flowfile/flow_graph.py +++ b/flowfile_core/flowfile_core/flowfile/flow_graph.py @@ -2566,6 +2566,25 @@ def run_graph(self) -> RunInformation | None: plan_skip_ids: set[str | int] = {n.node_id for n in execution_plan.skip_nodes} rerun_node_ids = self._compute_rerun_python_script_node_ids(plan_skip_ids) + # Expand re-run set: if a re-running node previously deleted + # artifacts, the original producer nodes must also re-run so + # those artifacts are available again in the kernel store. + while True: + deleted_producers = self.artifact_context.get_producer_nodes_for_deletions( + rerun_node_ids, + ) + new_ids = deleted_producers - rerun_node_ids + if not new_ids: + break + rerun_node_ids |= new_ids + + # Force producer nodes (added due to artifact deletions) to + # actually re-execute by marking their execution state stale. + for nid in rerun_node_ids: + node = self.get_node(nid) + if node is not None and node._execution_state.has_run_with_current_setup: + node._execution_state.has_run_with_current_setup = False + # Also purge stale metadata for nodes not in this graph # (e.g. injected externally or left over from removed nodes). 
graph_node_ids = set(self._node_db.keys()) diff --git a/flowfile_core/tests/flowfile/test_artifact_context.py b/flowfile_core/tests/flowfile/test_artifact_context.py index c40e8b808..1d6fd29fa 100644 --- a/flowfile_core/tests/flowfile/test_artifact_context.py +++ b/flowfile_core/tests/flowfile/test_artifact_context.py @@ -417,3 +417,64 @@ def test_to_dict_is_json_serialisable(self): # Should not raise serialised = json.dumps(d) assert isinstance(serialised, str) + + +# --------------------------------------------------------------------------- +# ArtifactContext — Deletion origin tracking +# --------------------------------------------------------------------------- + + +class TestArtifactContextDeletionOrigins: + def test_get_producer_nodes_for_deletions_basic(self): + """Deleting an artifact tracks the original publisher.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_deleted(2, "k1", ["model"]) + producers = ctx.get_producer_nodes_for_deletions({2}) + assert producers == {1} + + def test_get_producer_nodes_for_deletions_no_deletions(self): + """Nodes without deletions return an empty set.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + producers = ctx.get_producer_nodes_for_deletions({1}) + assert producers == set() + + def test_get_producer_nodes_for_deletions_multiple_artifacts(self): + """Deleting multiple artifacts from different producers.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k1", ["scaler"]) + ctx.record_deleted(3, "k1", ["model", "scaler"]) + producers = ctx.get_producer_nodes_for_deletions({3}) + assert producers == {1, 2} + + def test_clear_nodes_removes_deletion_origins(self): + """Clearing a deleter node also clears its deletion origins.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_deleted(2, "k1", ["model"]) + ctx.clear_nodes({2}) + producers = ctx.get_producer_nodes_for_deletions({2}) + assert producers == set() + + def test_clear_all_removes_deletion_origins(self): + """clear_all removes all deletion origin tracking.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_deleted(2, "k1", ["model"]) + ctx.clear_all() + producers = ctx.get_producer_nodes_for_deletions({2}) + assert producers == set() + + def test_clear_kernel_removes_deletion_origins(self): + """clear_kernel removes deletion origins for that kernel only.""" + ctx = ArtifactContext() + ctx.record_published(1, "k1", ["model"]) + ctx.record_published(2, "k2", ["encoder"]) + ctx.record_deleted(3, "k1", ["model"]) + ctx.record_deleted(3, "k2", ["encoder"]) + ctx.clear_kernel("k1") + producers = ctx.get_producer_nodes_for_deletions({3}) + # Only the k2 producer should remain + assert producers == {2} diff --git a/flowfile_core/tests/flowfile/test_kernel_integration.py b/flowfile_core/tests/flowfile/test_kernel_integration.py index b4b7d307d..30734ef57 100644 --- a/flowfile_core/tests/flowfile/test_kernel_integration.py +++ b/flowfile_core/tests/flowfile/test_kernel_integration.py @@ -1555,3 +1555,118 @@ def test_rerun_producer_clears_old_artifacts( finally: _kernel_mod._manager = _prev + + def test_deleted_artifact_producer_reruns_on_consumer_change( + self, kernel_manager: tuple[KernelManager, str], + ): + """When a consumer that deleted an artifact is re-run, the + producer must also re-run so the artifact is available again.""" + manager, kernel_id = kernel_manager + import flowfile_core.kernel as _kernel_mod + + _prev 
= _kernel_mod._manager + _kernel_mod._manager = manager + + try: + graph = _create_graph() + + # Node 1: input data + data = [{"x1": 1, "x2": 2, "y": 5}, {"x1": 3, "x2": 4, "y": 11}] + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=1, node_type="manual_input"), + ) + graph.add_manual_input( + input_schema.NodeManualInput( + flow_id=1, node_id=1, + raw_data_format=input_schema.RawData.from_pylist(data), + ) + ) + + # Node 2: publish artifact + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=2, node_type="python_script"), + ) + producer_code = """ +df = flowfile.read_input() +flowfile.publish_artifact("linear_model", {"coefficients": [1.0, 2.0, 3.0]}) +flowfile.publish_output(df) +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=2, depending_on_ids=[1], + python_script_input=input_schema.PythonScriptInput( + code=producer_code, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(1, 2), + ) + + # Node 3: read artifact, use it, then delete it + graph.add_node_promise( + input_schema.NodePromise(flow_id=1, node_id=3, node_type="python_script"), + ) + consumer_code_v1 = """ +import polars as pl +df = flowfile.read_input().collect() +model = flowfile.read_artifact("linear_model") +coeffs = model["coefficients"] +result = df.with_columns(pl.lit(coeffs[0]).alias("c0")) +flowfile.publish_output(result) +flowfile.delete_artifact("linear_model") +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=consumer_code_v1, kernel_id=kernel_id, + ), + ) + ) + add_connection( + graph, + input_schema.NodeConnection.create_from_simple_input(2, 3), + ) + + # First run — everything works + _handle_run_info(graph.run_graph()) + + # Verify node 3 produced output + df_out = graph.get_node(3).get_resulting_data().data_frame.collect() + assert "c0" in df_out.columns + + # Change the consumer's code (node 3) — still deletes the artifact + consumer_code_v2 = """ +import polars as pl +df = flowfile.read_input().collect() +model = flowfile.read_artifact("linear_model") +coeffs = model["coefficients"] +result = df.with_columns( + pl.lit(coeffs[0]).alias("c0"), + pl.lit(coeffs[1]).alias("c1"), +) +flowfile.publish_output(result) +flowfile.delete_artifact("linear_model") +""" + graph.add_python_script( + input_schema.NodePythonScript( + flow_id=1, node_id=3, depending_on_ids=[2], + python_script_input=input_schema.PythonScriptInput( + code=consumer_code_v2, kernel_id=kernel_id, + ), + ) + ) + + # Second run — consumer re-runs; producer must also re-run + # because the artifact was deleted on the first run. 
+ _handle_run_info(graph.run_graph()) + + # Consumer should have the new columns + df_out2 = graph.get_node(3).get_resulting_data().data_frame.collect() + assert "c0" in df_out2.columns + assert "c1" in df_out2.columns + + finally: + _kernel_mod._manager = _prev From 040c8f984a9ed99d1120af8a6dda29c2afdde0be Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Tue, 3 Feb 2026 08:37:02 +0100 Subject: [PATCH 10/38] Add catalog service layer with repository pattern (#298) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Implement service layer for Flow Catalog system Extract business logic from route handlers into a proper layered architecture: - catalog/exceptions.py: Domain-specific exceptions (CatalogError hierarchy) replacing inline HTTPException raises in service code - catalog/repository.py: CatalogRepository Protocol + SQLAlchemy implementation abstracting all data access - catalog/service.py: CatalogService class owning all business logic (validation, enrichment, authorization checks) - catalog/__init__.py: Public package interface Refactor routes/catalog.py into a thin HTTP adapter that injects CatalogService via FastAPI Depends, delegates to service methods, and translates domain exceptions to HTTP responses. All 33 existing catalog API tests pass with no behavior changes. https://claude.ai/code/session_017KkbxgQFxELX8fhk3cDYGF * Address performance and observability concerns 1. Fix N+1 queries in flow listing (4×N → 3 queries): - Add bulk_get_favorite_flow_ids, bulk_get_follow_flow_ids, bulk_get_run_stats to CatalogRepository - Add _bulk_enrich_flows to CatalogService - Update list_flows, get_namespace_tree, list_favorites, list_following, get_catalog_stats to use bulk enrichment 2. Add tech debt comment for ArtifactStore memory pattern: - Document the in-memory storage limitation for large artifacts - Suggest future improvements (spill-to-disk, external store) 3. Promote _auto_register_flow logging from debug to info: - Users can now see why flows don't appear in catalog - Log success and specific failure reasons 4. Improve _run_and_track error handling: - Use ERROR level for DB persistence failures - Add tracking_succeeded flag with explicit failure message - Log successful tracking with run details - Add context about flow status in error messages --- .../flowfile_core/catalog/__init__.py | 44 ++ .../flowfile_core/catalog/exceptions.py | 121 ++++ .../flowfile_core/catalog/repository.py | 504 +++++++++++++ .../flowfile_core/catalog/service.py | 672 ++++++++++++++++++ flowfile_core/flowfile_core/routes/catalog.py | 502 ++++--------- flowfile_core/flowfile_core/routes/routes.py | 59 +- .../kernel_runtime/artifact_store.py | 15 + 7 files changed, 1526 insertions(+), 391 deletions(-) create mode 100644 flowfile_core/flowfile_core/catalog/__init__.py create mode 100644 flowfile_core/flowfile_core/catalog/exceptions.py create mode 100644 flowfile_core/flowfile_core/catalog/repository.py create mode 100644 flowfile_core/flowfile_core/catalog/service.py diff --git a/flowfile_core/flowfile_core/catalog/__init__.py b/flowfile_core/flowfile_core/catalog/__init__.py new file mode 100644 index 000000000..67c9bf8e4 --- /dev/null +++ b/flowfile_core/flowfile_core/catalog/__init__.py @@ -0,0 +1,44 @@ +"""Flow Catalog service layer. 
+ +Public interface: + +* ``CatalogService`` — business-logic orchestrator +* ``CatalogRepository`` — data-access protocol (for type-hints / mocking) +* ``SQLAlchemyCatalogRepository`` — concrete SQLAlchemy implementation +* Domain exceptions (``CatalogError`` hierarchy) +""" + +from .exceptions import ( + CatalogError, + FavoriteNotFoundError, + FlowExistsError, + FlowNotFoundError, + FollowNotFoundError, + NamespaceExistsError, + NamespaceNotEmptyError, + NamespaceNotFoundError, + NestingLimitError, + NoSnapshotError, + NotAuthorizedError, + RunNotFoundError, +) +from .repository import CatalogRepository, SQLAlchemyCatalogRepository +from .service import CatalogService + +__all__ = [ + "CatalogService", + "CatalogRepository", + "SQLAlchemyCatalogRepository", + "CatalogError", + "NamespaceNotFoundError", + "NamespaceExistsError", + "NestingLimitError", + "NamespaceNotEmptyError", + "FlowNotFoundError", + "FlowExistsError", + "RunNotFoundError", + "NotAuthorizedError", + "FavoriteNotFoundError", + "FollowNotFoundError", + "NoSnapshotError", +] diff --git a/flowfile_core/flowfile_core/catalog/exceptions.py b/flowfile_core/flowfile_core/catalog/exceptions.py new file mode 100644 index 000000000..37d06bc53 --- /dev/null +++ b/flowfile_core/flowfile_core/catalog/exceptions.py @@ -0,0 +1,121 @@ +"""Domain-specific exceptions for the Flow Catalog system. + +These exceptions represent business-rule violations and are raised by the +service layer. Route handlers catch them and translate to appropriate +HTTP responses. +""" + + +class CatalogError(Exception): + """Base exception for all catalog domain errors.""" + + +class NamespaceNotFoundError(CatalogError): + """Raised when a namespace lookup fails.""" + + def __init__(self, namespace_id: int | None = None, name: str | None = None): + self.namespace_id = namespace_id + self.name = name + detail = "Namespace not found" + if namespace_id is not None: + detail = f"Namespace with id={namespace_id} not found" + elif name is not None: + detail = f"Namespace '{name}' not found" + super().__init__(detail) + + +class NamespaceExistsError(CatalogError): + """Raised when attempting to create a duplicate namespace.""" + + def __init__(self, name: str, parent_id: int | None = None): + self.name = name + self.parent_id = parent_id + super().__init__( + f"Namespace '{name}' already exists" + + (f" under parent_id={parent_id}" if parent_id is not None else " at root level") + ) + + +class NestingLimitError(CatalogError): + """Raised when attempting to nest namespaces deeper than catalog -> schema.""" + + def __init__(self, parent_id: int, parent_level: int): + self.parent_id = parent_id + self.parent_level = parent_level + super().__init__("Cannot nest deeper than catalog -> schema") + + +class NamespaceNotEmptyError(CatalogError): + """Raised when trying to delete a namespace that still has children or flows.""" + + def __init__(self, namespace_id: int, children: int = 0, flows: int = 0): + self.namespace_id = namespace_id + self.children = children + self.flows = flows + super().__init__("Cannot delete namespace with children or flows") + + +class FlowNotFoundError(CatalogError): + """Raised when a flow registration lookup fails.""" + + def __init__(self, registration_id: int | None = None, name: str | None = None): + self.registration_id = registration_id + self.name = name + detail = "Flow not found" + if registration_id is not None: + detail = f"Flow with id={registration_id} not found" + elif name is not None: + detail = f"Flow '{name}' not found" + 
super().__init__(detail) + + +class FlowExistsError(CatalogError): + """Raised when attempting to create a duplicate flow registration.""" + + def __init__(self, name: str, namespace_id: int | None = None): + self.name = name + self.namespace_id = namespace_id + super().__init__(f"Flow '{name}' already exists in namespace_id={namespace_id}") + + +class RunNotFoundError(CatalogError): + """Raised when a flow run lookup fails.""" + + def __init__(self, run_id: int): + self.run_id = run_id + super().__init__(f"Run with id={run_id} not found") + + +class NotAuthorizedError(CatalogError): + """Raised when a user attempts an action they are not permitted to perform.""" + + def __init__(self, user_id: int, action: str = "perform this action"): + self.user_id = user_id + self.action = action + super().__init__(f"User {user_id} is not authorized to {action}") + + +class FavoriteNotFoundError(CatalogError): + """Raised when a favorite record is not found.""" + + def __init__(self, user_id: int, registration_id: int): + self.user_id = user_id + self.registration_id = registration_id + super().__init__(f"Favorite not found for user={user_id}, flow={registration_id}") + + +class FollowNotFoundError(CatalogError): + """Raised when a follow record is not found.""" + + def __init__(self, user_id: int, registration_id: int): + self.user_id = user_id + self.registration_id = registration_id + super().__init__(f"Follow not found for user={user_id}, flow={registration_id}") + + +class NoSnapshotError(CatalogError): + """Raised when a run has no flow snapshot available.""" + + def __init__(self, run_id: int): + self.run_id = run_id + super().__init__(f"No flow snapshot available for run id={run_id}") diff --git a/flowfile_core/flowfile_core/catalog/repository.py b/flowfile_core/flowfile_core/catalog/repository.py new file mode 100644 index 000000000..e81f6f42e --- /dev/null +++ b/flowfile_core/flowfile_core/catalog/repository.py @@ -0,0 +1,504 @@ +"""Data-access abstraction for the Flow Catalog system. + +Defines a ``CatalogRepository`` :pep:`544` Protocol and provides a concrete +``SQLAlchemyCatalogRepository`` implementation backed by SQLAlchemy. +""" + +from __future__ import annotations + +from typing import Protocol, runtime_checkable + +from sqlalchemy.orm import Session + +from flowfile_core.database.models import ( + CatalogNamespace, + FlowFavorite, + FlowFollow, + FlowRegistration, + FlowRun, +) + + +# --------------------------------------------------------------------------- +# Repository Protocol +# --------------------------------------------------------------------------- + + +@runtime_checkable +class CatalogRepository(Protocol): + """Abstract interface for catalog data access. + + Any class that satisfies this protocol can be used by ``CatalogService``, + enabling easy unit-testing with mock implementations. + """ + + # -- Namespace operations ------------------------------------------------ + + def get_namespace(self, namespace_id: int) -> CatalogNamespace | None: ... + + def get_namespace_by_name( + self, name: str, parent_id: int | None + ) -> CatalogNamespace | None: ... + + def list_namespaces(self, parent_id: int | None = None) -> list[CatalogNamespace]: ... + + def list_root_namespaces(self) -> list[CatalogNamespace]: ... + + def list_child_namespaces(self, parent_id: int) -> list[CatalogNamespace]: ... + + def create_namespace(self, ns: CatalogNamespace) -> CatalogNamespace: ... + + def update_namespace(self, ns: CatalogNamespace) -> CatalogNamespace: ... 
+ + def delete_namespace(self, namespace_id: int) -> None: ... + + def count_children(self, namespace_id: int) -> int: ... + + # -- Flow registration operations ---------------------------------------- + + def get_flow(self, registration_id: int) -> FlowRegistration | None: ... + + def get_flow_by_name( + self, name: str, namespace_id: int + ) -> FlowRegistration | None: ... + + def get_flow_by_path(self, flow_path: str) -> FlowRegistration | None: ... + + def list_flows( + self, + namespace_id: int | None = None, + owner_id: int | None = None, + ) -> list[FlowRegistration]: ... + + def create_flow(self, reg: FlowRegistration) -> FlowRegistration: ... + + def update_flow(self, reg: FlowRegistration) -> FlowRegistration: ... + + def delete_flow(self, registration_id: int) -> None: ... + + def count_flows_in_namespace(self, namespace_id: int) -> int: ... + + # -- Run operations ------------------------------------------------------ + + def get_run(self, run_id: int) -> FlowRun | None: ... + + def list_runs( + self, + registration_id: int | None = None, + limit: int = 50, + offset: int = 0, + ) -> list[FlowRun]: ... + + def create_run(self, run: FlowRun) -> FlowRun: ... + + def update_run(self, run: FlowRun) -> FlowRun: ... + + def count_runs(self) -> int: ... + + # -- Favorites ----------------------------------------------------------- + + def get_favorite( + self, user_id: int, registration_id: int + ) -> FlowFavorite | None: ... + + def add_favorite(self, fav: FlowFavorite) -> FlowFavorite: ... + + def remove_favorite(self, user_id: int, registration_id: int) -> None: ... + + def list_favorites(self, user_id: int) -> list[FlowFavorite]: ... + + def count_favorites(self, user_id: int) -> int: ... + + # -- Follows ------------------------------------------------------------- + + def get_follow( + self, user_id: int, registration_id: int + ) -> FlowFollow | None: ... + + def add_follow(self, follow: FlowFollow) -> FlowFollow: ... + + def remove_follow(self, user_id: int, registration_id: int) -> None: ... + + def list_follows(self, user_id: int) -> list[FlowFollow]: ... + + # -- Aggregate helpers --------------------------------------------------- + + def count_run_for_flow(self, registration_id: int) -> int: ... + + def last_run_for_flow(self, registration_id: int) -> FlowRun | None: ... + + def count_catalog_namespaces(self) -> int: ... + + def count_all_flows(self) -> int: ... + + # -- Bulk enrichment helpers (for N+1 elimination) ----------------------- + + def bulk_get_favorite_flow_ids( + self, user_id: int, flow_ids: list[int] + ) -> set[int]: ... + + def bulk_get_follow_flow_ids( + self, user_id: int, flow_ids: list[int] + ) -> set[int]: ... + + def bulk_get_run_stats( + self, flow_ids: list[int] + ) -> dict[int, tuple[int, FlowRun | None]]: ... 
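A minimal sketch (illustrative only, not part of this diff) of what the Protocol above buys: ``CatalogService`` can be unit-tested without a database session, because any object exposing the methods a given code path calls can stand in for the SQLAlchemy repository. The ``_StubRepo`` below is a hypothetical test double, not project code; it implements only ``get_namespace`` and exercises the not-found path.

    from flowfile_core.catalog import CatalogService, NamespaceNotFoundError

    class _StubRepo:
        # Hypothetical test double: only get_namespace() is implemented,
        # which is all CatalogService.get_namespace() touches.
        def get_namespace(self, namespace_id: int):
            return None  # simulate "no such namespace"

    service = CatalogService(_StubRepo())  # duck-typed; no Session required
    try:
        service.get_namespace(42)
    except NamespaceNotFoundError as exc:
        print(exc)  # "Namespace with id=42 not found"

A fuller fake would add the remaining Protocol methods as individual tests need them; the ``runtime_checkable`` decorator only comes into play if a test also wants an ``isinstance`` check against ``CatalogRepository``.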
+ + +# --------------------------------------------------------------------------- +# SQLAlchemy implementation +# --------------------------------------------------------------------------- + + +class SQLAlchemyCatalogRepository: + """Concrete ``CatalogRepository`` backed by a SQLAlchemy ``Session``.""" + + def __init__(self, db: Session) -> None: + self._db = db + + # -- Namespace operations ------------------------------------------------ + + def get_namespace(self, namespace_id: int) -> CatalogNamespace | None: + return self._db.get(CatalogNamespace, namespace_id) + + def get_namespace_by_name( + self, name: str, parent_id: int | None + ) -> CatalogNamespace | None: + return ( + self._db.query(CatalogNamespace) + .filter_by(name=name, parent_id=parent_id) + .first() + ) + + def list_namespaces(self, parent_id: int | None = None) -> list[CatalogNamespace]: + q = self._db.query(CatalogNamespace) + if parent_id is not None: + q = q.filter(CatalogNamespace.parent_id == parent_id) + else: + q = q.filter(CatalogNamespace.parent_id.is_(None)) + return q.order_by(CatalogNamespace.name).all() + + def list_root_namespaces(self) -> list[CatalogNamespace]: + return ( + self._db.query(CatalogNamespace) + .filter(CatalogNamespace.parent_id.is_(None)) + .order_by(CatalogNamespace.name) + .all() + ) + + def list_child_namespaces(self, parent_id: int) -> list[CatalogNamespace]: + return ( + self._db.query(CatalogNamespace) + .filter_by(parent_id=parent_id) + .order_by(CatalogNamespace.name) + .all() + ) + + def create_namespace(self, ns: CatalogNamespace) -> CatalogNamespace: + self._db.add(ns) + self._db.commit() + self._db.refresh(ns) + return ns + + def update_namespace(self, ns: CatalogNamespace) -> CatalogNamespace: + self._db.commit() + self._db.refresh(ns) + return ns + + def delete_namespace(self, namespace_id: int) -> None: + ns = self._db.get(CatalogNamespace, namespace_id) + if ns is not None: + self._db.delete(ns) + self._db.commit() + + def count_children(self, namespace_id: int) -> int: + return ( + self._db.query(CatalogNamespace) + .filter_by(parent_id=namespace_id) + .count() + ) + + # -- Flow registration operations ---------------------------------------- + + def get_flow(self, registration_id: int) -> FlowRegistration | None: + return self._db.get(FlowRegistration, registration_id) + + def get_flow_by_name( + self, name: str, namespace_id: int + ) -> FlowRegistration | None: + return ( + self._db.query(FlowRegistration) + .filter_by(name=name, namespace_id=namespace_id) + .first() + ) + + def get_flow_by_path(self, flow_path: str) -> FlowRegistration | None: + return ( + self._db.query(FlowRegistration) + .filter_by(flow_path=flow_path) + .first() + ) + + def list_flows( + self, + namespace_id: int | None = None, + owner_id: int | None = None, + ) -> list[FlowRegistration]: + q = self._db.query(FlowRegistration) + if namespace_id is not None: + q = q.filter_by(namespace_id=namespace_id) + if owner_id is not None: + q = q.filter_by(owner_id=owner_id) + return q.order_by(FlowRegistration.name).all() + + def create_flow(self, reg: FlowRegistration) -> FlowRegistration: + self._db.add(reg) + self._db.commit() + self._db.refresh(reg) + return reg + + def update_flow(self, reg: FlowRegistration) -> FlowRegistration: + self._db.commit() + self._db.refresh(reg) + return reg + + def delete_flow(self, registration_id: int) -> None: + # Clean up related records first + self._db.query(FlowFavorite).filter_by(registration_id=registration_id).delete() + 
self._db.query(FlowFollow).filter_by(registration_id=registration_id).delete() + flow = self._db.get(FlowRegistration, registration_id) + if flow is not None: + self._db.delete(flow) + self._db.commit() + + def count_flows_in_namespace(self, namespace_id: int) -> int: + return ( + self._db.query(FlowRegistration) + .filter_by(namespace_id=namespace_id) + .count() + ) + + # -- Run operations ------------------------------------------------------ + + def get_run(self, run_id: int) -> FlowRun | None: + return self._db.get(FlowRun, run_id) + + def list_runs( + self, + registration_id: int | None = None, + limit: int = 50, + offset: int = 0, + ) -> list[FlowRun]: + q = self._db.query(FlowRun) + if registration_id is not None: + q = q.filter_by(registration_id=registration_id) + return ( + q.order_by(FlowRun.started_at.desc()) + .offset(offset) + .limit(limit) + .all() + ) + + def create_run(self, run: FlowRun) -> FlowRun: + self._db.add(run) + self._db.commit() + self._db.refresh(run) + return run + + def update_run(self, run: FlowRun) -> FlowRun: + self._db.commit() + self._db.refresh(run) + return run + + def count_runs(self) -> int: + return self._db.query(FlowRun).count() + + # -- Favorites ----------------------------------------------------------- + + def get_favorite( + self, user_id: int, registration_id: int + ) -> FlowFavorite | None: + return ( + self._db.query(FlowFavorite) + .filter_by(user_id=user_id, registration_id=registration_id) + .first() + ) + + def add_favorite(self, fav: FlowFavorite) -> FlowFavorite: + self._db.add(fav) + self._db.commit() + self._db.refresh(fav) + return fav + + def remove_favorite(self, user_id: int, registration_id: int) -> None: + fav = ( + self._db.query(FlowFavorite) + .filter_by(user_id=user_id, registration_id=registration_id) + .first() + ) + if fav is not None: + self._db.delete(fav) + self._db.commit() + + def list_favorites(self, user_id: int) -> list[FlowFavorite]: + return ( + self._db.query(FlowFavorite) + .filter_by(user_id=user_id) + .order_by(FlowFavorite.created_at.desc()) + .all() + ) + + def count_favorites(self, user_id: int) -> int: + return ( + self._db.query(FlowFavorite) + .filter_by(user_id=user_id) + .count() + ) + + # -- Follows ------------------------------------------------------------- + + def get_follow( + self, user_id: int, registration_id: int + ) -> FlowFollow | None: + return ( + self._db.query(FlowFollow) + .filter_by(user_id=user_id, registration_id=registration_id) + .first() + ) + + def add_follow(self, follow: FlowFollow) -> FlowFollow: + self._db.add(follow) + self._db.commit() + self._db.refresh(follow) + return follow + + def remove_follow(self, user_id: int, registration_id: int) -> None: + follow = ( + self._db.query(FlowFollow) + .filter_by(user_id=user_id, registration_id=registration_id) + .first() + ) + if follow is not None: + self._db.delete(follow) + self._db.commit() + + def list_follows(self, user_id: int) -> list[FlowFollow]: + return ( + self._db.query(FlowFollow) + .filter_by(user_id=user_id) + .order_by(FlowFollow.created_at.desc()) + .all() + ) + + # -- Aggregate helpers --------------------------------------------------- + + def count_run_for_flow(self, registration_id: int) -> int: + return ( + self._db.query(FlowRun) + .filter_by(registration_id=registration_id) + .count() + ) + + def last_run_for_flow(self, registration_id: int) -> FlowRun | None: + return ( + self._db.query(FlowRun) + .filter_by(registration_id=registration_id) + .order_by(FlowRun.started_at.desc()) + .first() + ) + + 
def count_catalog_namespaces(self) -> int: + return ( + self._db.query(CatalogNamespace) + .filter_by(level=0) + .count() + ) + + def count_all_flows(self) -> int: + return self._db.query(FlowRegistration).count() + + # -- Bulk enrichment helpers (for N+1 elimination) ----------------------- + + def bulk_get_favorite_flow_ids( + self, user_id: int, flow_ids: list[int] + ) -> set[int]: + """Return the subset of flow_ids that the user has favourited.""" + if not flow_ids: + return set() + rows = ( + self._db.query(FlowFavorite.registration_id) + .filter( + FlowFavorite.user_id == user_id, + FlowFavorite.registration_id.in_(flow_ids), + ) + .all() + ) + return {r[0] for r in rows} + + def bulk_get_follow_flow_ids( + self, user_id: int, flow_ids: list[int] + ) -> set[int]: + """Return the subset of flow_ids that the user is following.""" + if not flow_ids: + return set() + rows = ( + self._db.query(FlowFollow.registration_id) + .filter( + FlowFollow.user_id == user_id, + FlowFollow.registration_id.in_(flow_ids), + ) + .all() + ) + return {r[0] for r in rows} + + def bulk_get_run_stats( + self, flow_ids: list[int] + ) -> dict[int, tuple[int, FlowRun | None]]: + """Return run_count and last_run for each flow_id in one query batch. + + Returns a dict: flow_id -> (run_count, last_run_or_none) + """ + if not flow_ids: + return {} + + from sqlalchemy import func + + # Query 1: counts per registration_id + count_rows = ( + self._db.query( + FlowRun.registration_id, + func.count(FlowRun.id).label("cnt"), + ) + .filter(FlowRun.registration_id.in_(flow_ids)) + .group_by(FlowRun.registration_id) + .all() + ) + counts = {r[0]: r[1] for r in count_rows} + + # Query 2: last run per registration_id using a subquery for max started_at + subq = ( + self._db.query( + FlowRun.registration_id, + func.max(FlowRun.started_at).label("max_started"), + ) + .filter(FlowRun.registration_id.in_(flow_ids)) + .group_by(FlowRun.registration_id) + .subquery() + ) + last_runs_rows = ( + self._db.query(FlowRun) + .join( + subq, + (FlowRun.registration_id == subq.c.registration_id) + & (FlowRun.started_at == subq.c.max_started), + ) + .all() + ) + last_runs = {r.registration_id: r for r in last_runs_rows} + + # Build result dict + result: dict[int, tuple[int, FlowRun | None]] = {} + for fid in flow_ids: + result[fid] = (counts.get(fid, 0), last_runs.get(fid)) + return result diff --git a/flowfile_core/flowfile_core/catalog/service.py b/flowfile_core/flowfile_core/catalog/service.py new file mode 100644 index 000000000..1cead5bed --- /dev/null +++ b/flowfile_core/flowfile_core/catalog/service.py @@ -0,0 +1,672 @@ +"""Business-logic layer for the Flow Catalog system. + +``CatalogService`` encapsulates all domain rules (validation, authorisation, +enrichment) and delegates persistence to a ``CatalogRepository``. It never +raises ``HTTPException`` — only domain-specific exceptions from +``catalog.exceptions``. 
+""" + +from __future__ import annotations + +import os +from datetime import datetime, timezone + +from flowfile_core.catalog.exceptions import ( + FavoriteNotFoundError, + FlowNotFoundError, + FollowNotFoundError, + NamespaceExistsError, + NamespaceNotEmptyError, + NamespaceNotFoundError, + NestingLimitError, + NoSnapshotError, + RunNotFoundError, +) +from flowfile_core.catalog.repository import CatalogRepository +from flowfile_core.database.models import ( + CatalogNamespace, + FlowFavorite, + FlowFollow, + FlowRegistration, + FlowRun, +) +from flowfile_core.schemas.catalog_schema import ( + CatalogStats, + FlowRegistrationOut, + FlowRunDetail, + FlowRunOut, + NamespaceTree, +) + + +class CatalogService: + """Coordinates all catalog business logic. + + Parameters + ---------- + repo: + Any object satisfying the ``CatalogRepository`` protocol. + """ + + def __init__(self, repo: CatalogRepository) -> None: + self.repo = repo + + # ------------------------------------------------------------------ # + # Private helpers + # ------------------------------------------------------------------ # + + def _enrich_flow_registration( + self, flow: FlowRegistration, user_id: int + ) -> FlowRegistrationOut: + """Attach favourite/follow flags and run stats to a single registration. + + Note: For bulk operations, prefer ``_bulk_enrich_flows`` to avoid N+1 queries. + """ + is_fav = self.repo.get_favorite(user_id, flow.id) is not None + is_follow = self.repo.get_follow(user_id, flow.id) is not None + run_count = self.repo.count_run_for_flow(flow.id) + last_run = self.repo.last_run_for_flow(flow.id) + return FlowRegistrationOut( + id=flow.id, + name=flow.name, + description=flow.description, + flow_path=flow.flow_path, + namespace_id=flow.namespace_id, + owner_id=flow.owner_id, + created_at=flow.created_at, + updated_at=flow.updated_at, + is_favorite=is_fav, + is_following=is_follow, + run_count=run_count, + last_run_at=last_run.started_at if last_run else None, + last_run_success=last_run.success if last_run else None, + file_exists=os.path.exists(flow.flow_path) if flow.flow_path else False, + ) + + def _bulk_enrich_flows( + self, flows: list[FlowRegistration], user_id: int + ) -> list[FlowRegistrationOut]: + """Enrich multiple flows with favourites, follows, and run stats in bulk. + + Uses 3 queries total instead of 4×N, dramatically improving performance + when listing many flows. 
+ """ + if not flows: + return [] + + flow_ids = [f.id for f in flows] + + # Bulk fetch all enrichment data (3 queries total) + fav_ids = self.repo.bulk_get_favorite_flow_ids(user_id, flow_ids) + follow_ids = self.repo.bulk_get_follow_flow_ids(user_id, flow_ids) + run_stats = self.repo.bulk_get_run_stats(flow_ids) + + result: list[FlowRegistrationOut] = [] + for flow in flows: + run_count, last_run = run_stats.get(flow.id, (0, None)) + result.append( + FlowRegistrationOut( + id=flow.id, + name=flow.name, + description=flow.description, + flow_path=flow.flow_path, + namespace_id=flow.namespace_id, + owner_id=flow.owner_id, + created_at=flow.created_at, + updated_at=flow.updated_at, + is_favorite=flow.id in fav_ids, + is_following=flow.id in follow_ids, + run_count=run_count, + last_run_at=last_run.started_at if last_run else None, + last_run_success=last_run.success if last_run else None, + file_exists=os.path.exists(flow.flow_path) if flow.flow_path else False, + ) + ) + return result + + @staticmethod + def _run_to_out(run: FlowRun) -> FlowRunOut: + return FlowRunOut( + id=run.id, + registration_id=run.registration_id, + flow_name=run.flow_name, + flow_path=run.flow_path, + user_id=run.user_id, + started_at=run.started_at, + ended_at=run.ended_at, + success=run.success, + nodes_completed=run.nodes_completed, + number_of_nodes=run.number_of_nodes, + duration_seconds=run.duration_seconds, + run_type=run.run_type, + has_snapshot=run.flow_snapshot is not None, + ) + + # ------------------------------------------------------------------ # + # Namespace operations + # ------------------------------------------------------------------ # + + def create_namespace( + self, + name: str, + owner_id: int, + parent_id: int | None = None, + description: str | None = None, + ) -> CatalogNamespace: + """Create a catalog (level 0) or schema (level 1) namespace. + + Raises + ------ + NamespaceNotFoundError + If ``parent_id`` is given but doesn't exist. + NestingLimitError + If the parent is already at level 1 (schema). + NamespaceExistsError + If a namespace with the same name already exists under the parent. + """ + level = 0 + if parent_id is not None: + parent = self.repo.get_namespace(parent_id) + if parent is None: + raise NamespaceNotFoundError(namespace_id=parent_id) + if parent.level >= 1: + raise NestingLimitError(parent_id=parent_id, parent_level=parent.level) + level = parent.level + 1 + + existing = self.repo.get_namespace_by_name(name, parent_id) + if existing is not None: + raise NamespaceExistsError(name=name, parent_id=parent_id) + + ns = CatalogNamespace( + name=name, + parent_id=parent_id, + level=level, + description=description, + owner_id=owner_id, + ) + return self.repo.create_namespace(ns) + + def update_namespace( + self, + namespace_id: int, + name: str | None = None, + description: str | None = None, + ) -> CatalogNamespace: + """Update a namespace's name and/or description. + + Raises + ------ + NamespaceNotFoundError + If the namespace doesn't exist. + """ + ns = self.repo.get_namespace(namespace_id) + if ns is None: + raise NamespaceNotFoundError(namespace_id=namespace_id) + if name is not None: + ns.name = name + if description is not None: + ns.description = description + return self.repo.update_namespace(ns) + + def delete_namespace(self, namespace_id: int) -> None: + """Delete a namespace if it has no children or flows. + + Raises + ------ + NamespaceNotFoundError + If the namespace doesn't exist. 
+ NamespaceNotEmptyError + If the namespace has child namespaces or flow registrations. + """ + ns = self.repo.get_namespace(namespace_id) + if ns is None: + raise NamespaceNotFoundError(namespace_id=namespace_id) + children = self.repo.count_children(namespace_id) + flows = self.repo.count_flows_in_namespace(namespace_id) + if children > 0 or flows > 0: + raise NamespaceNotEmptyError( + namespace_id=namespace_id, children=children, flows=flows + ) + self.repo.delete_namespace(namespace_id) + + def get_namespace(self, namespace_id: int) -> CatalogNamespace: + """Retrieve a single namespace by ID. + + Raises + ------ + NamespaceNotFoundError + If the namespace doesn't exist. + """ + ns = self.repo.get_namespace(namespace_id) + if ns is None: + raise NamespaceNotFoundError(namespace_id=namespace_id) + return ns + + def list_namespaces(self, parent_id: int | None = None) -> list[CatalogNamespace]: + """List namespaces, optionally filtered by parent.""" + return self.repo.list_namespaces(parent_id) + + def get_namespace_tree(self, user_id: int) -> list[NamespaceTree]: + """Build the full catalog tree with flows nested under schemas. + + Uses bulk enrichment to avoid N+1 queries when there are many flows. + """ + catalogs = self.repo.list_root_namespaces() + + # Collect all flows first, then bulk-enrich them + all_flows: list[FlowRegistration] = [] + namespace_flow_map: dict[int, list[FlowRegistration]] = {} + + for cat in catalogs: + cat_flows = self.repo.list_flows(namespace_id=cat.id) + namespace_flow_map[cat.id] = cat_flows + all_flows.extend(cat_flows) + + for schema in self.repo.list_child_namespaces(cat.id): + schema_flows = self.repo.list_flows(namespace_id=schema.id) + namespace_flow_map[schema.id] = schema_flows + all_flows.extend(schema_flows) + + # Bulk enrich all flows at once + enriched = self._bulk_enrich_flows(all_flows, user_id) + enriched_map = {e.id: e for e in enriched} + + # Build tree structure + result: list[NamespaceTree] = [] + for cat in catalogs: + schemas = self.repo.list_child_namespaces(cat.id) + children: list[NamespaceTree] = [] + for schema in schemas: + schema_flows = namespace_flow_map.get(schema.id, []) + flow_outs = [enriched_map[f.id] for f in schema_flows if f.id in enriched_map] + children.append( + NamespaceTree( + id=schema.id, + name=schema.name, + parent_id=schema.parent_id, + level=schema.level, + description=schema.description, + owner_id=schema.owner_id, + created_at=schema.created_at, + updated_at=schema.updated_at, + children=[], + flows=flow_outs, + ) + ) + cat_flows = namespace_flow_map.get(cat.id, []) + root_flow_outs = [enriched_map[f.id] for f in cat_flows if f.id in enriched_map] + result.append( + NamespaceTree( + id=cat.id, + name=cat.name, + parent_id=cat.parent_id, + level=cat.level, + description=cat.description, + owner_id=cat.owner_id, + created_at=cat.created_at, + updated_at=cat.updated_at, + children=children, + flows=root_flow_outs, + ) + ) + return result + + def get_default_namespace_id(self) -> int | None: + """Return the ID of the default 'user_flows' schema under 'General'.""" + general = self.repo.get_namespace_by_name("General", parent_id=None) + if general is None: + return None + user_flows = self.repo.get_namespace_by_name("user_flows", parent_id=general.id) + if user_flows is None: + return None + return user_flows.id + + # ------------------------------------------------------------------ # + # Flow registration operations + # ------------------------------------------------------------------ # + + def register_flow( 
+ self, + name: str, + flow_path: str, + owner_id: int, + namespace_id: int | None = None, + description: str | None = None, + ) -> FlowRegistrationOut: + """Register a new flow in the catalog. + + Raises + ------ + NamespaceNotFoundError + If ``namespace_id`` is given but doesn't exist. + """ + if namespace_id is not None: + ns = self.repo.get_namespace(namespace_id) + if ns is None: + raise NamespaceNotFoundError(namespace_id=namespace_id) + flow = FlowRegistration( + name=name, + description=description, + flow_path=flow_path, + namespace_id=namespace_id, + owner_id=owner_id, + ) + flow = self.repo.create_flow(flow) + return self._enrich_flow_registration(flow, owner_id) + + def update_flow( + self, + registration_id: int, + requesting_user_id: int, + name: str | None = None, + description: str | None = None, + namespace_id: int | None = None, + ) -> FlowRegistrationOut: + """Update a flow registration. + + Raises + ------ + FlowNotFoundError + If the flow doesn't exist. + """ + flow = self.repo.get_flow(registration_id) + if flow is None: + raise FlowNotFoundError(registration_id=registration_id) + if name is not None: + flow.name = name + if description is not None: + flow.description = description + if namespace_id is not None: + flow.namespace_id = namespace_id + flow = self.repo.update_flow(flow) + return self._enrich_flow_registration(flow, requesting_user_id) + + def delete_flow(self, registration_id: int) -> None: + """Delete a flow and its related favourites/follows. + + Raises + ------ + FlowNotFoundError + If the flow doesn't exist. + """ + flow = self.repo.get_flow(registration_id) + if flow is None: + raise FlowNotFoundError(registration_id=registration_id) + self.repo.delete_flow(registration_id) + + def get_flow(self, registration_id: int, user_id: int) -> FlowRegistrationOut: + """Get an enriched flow registration. + + Raises + ------ + FlowNotFoundError + If the flow doesn't exist. + """ + flow = self.repo.get_flow(registration_id) + if flow is None: + raise FlowNotFoundError(registration_id=registration_id) + return self._enrich_flow_registration(flow, user_id) + + def list_flows( + self, user_id: int, namespace_id: int | None = None + ) -> list[FlowRegistrationOut]: + """List flows, optionally filtered by namespace, enriched with user context. + + Uses bulk enrichment to avoid N+1 queries. + """ + flows = self.repo.list_flows(namespace_id=namespace_id) + return self._bulk_enrich_flows(flows, user_id) + + # ------------------------------------------------------------------ # + # Run operations + # ------------------------------------------------------------------ # + + def list_runs( + self, + registration_id: int | None = None, + limit: int = 50, + offset: int = 0, + ) -> list[FlowRunOut]: + """List run summaries (without snapshots).""" + runs = self.repo.list_runs( + registration_id=registration_id, limit=limit, offset=offset + ) + return [self._run_to_out(r) for r in runs] + + def get_run_detail(self, run_id: int) -> FlowRunDetail: + """Get a single run including the YAML snapshot. + + Raises + ------ + RunNotFoundError + If the run doesn't exist. 
+ """ + run = self.repo.get_run(run_id) + if run is None: + raise RunNotFoundError(run_id=run_id) + return FlowRunDetail( + id=run.id, + registration_id=run.registration_id, + flow_name=run.flow_name, + flow_path=run.flow_path, + user_id=run.user_id, + started_at=run.started_at, + ended_at=run.ended_at, + success=run.success, + nodes_completed=run.nodes_completed, + number_of_nodes=run.number_of_nodes, + duration_seconds=run.duration_seconds, + run_type=run.run_type, + has_snapshot=run.flow_snapshot is not None, + flow_snapshot=run.flow_snapshot, + node_results_json=run.node_results_json, + ) + + def get_run(self, run_id: int) -> FlowRun: + """Get a raw FlowRun model. + + Raises + ------ + RunNotFoundError + If the run doesn't exist. + """ + run = self.repo.get_run(run_id) + if run is None: + raise RunNotFoundError(run_id=run_id) + return run + + def start_run( + self, + registration_id: int | None, + flow_name: str, + flow_path: str | None, + user_id: int, + number_of_nodes: int, + run_type: str = "full_run", + flow_snapshot: str | None = None, + ) -> FlowRun: + """Record a new flow run start.""" + run = FlowRun( + registration_id=registration_id, + flow_name=flow_name, + flow_path=flow_path, + user_id=user_id, + started_at=datetime.now(timezone.utc), + number_of_nodes=number_of_nodes, + run_type=run_type, + flow_snapshot=flow_snapshot, + ) + return self.repo.create_run(run) + + def complete_run( + self, + run_id: int, + success: bool, + nodes_completed: int, + node_results_json: str | None = None, + ) -> FlowRun: + """Mark a run as completed. + + Raises + ------ + RunNotFoundError + If the run doesn't exist. + """ + run = self.repo.get_run(run_id) + if run is None: + raise RunNotFoundError(run_id=run_id) + now = datetime.now(timezone.utc) + run.ended_at = now + run.success = success + run.nodes_completed = nodes_completed + if run.started_at: + run.duration_seconds = (now - run.started_at).total_seconds() + if node_results_json is not None: + run.node_results_json = node_results_json + return self.repo.update_run(run) + + def get_run_snapshot(self, run_id: int) -> str: + """Return the flow snapshot text for a run. + + Raises + ------ + RunNotFoundError + If the run doesn't exist. + NoSnapshotError + If the run has no snapshot. + """ + run = self.repo.get_run(run_id) + if run is None: + raise RunNotFoundError(run_id=run_id) + if not run.flow_snapshot: + raise NoSnapshotError(run_id=run_id) + return run.flow_snapshot + + # ------------------------------------------------------------------ # + # Favorites + # ------------------------------------------------------------------ # + + def add_favorite(self, user_id: int, registration_id: int) -> FlowFavorite: + """Add a flow to user's favourites (idempotent). + + Raises + ------ + FlowNotFoundError + If the flow doesn't exist. + """ + flow = self.repo.get_flow(registration_id) + if flow is None: + raise FlowNotFoundError(registration_id=registration_id) + existing = self.repo.get_favorite(user_id, registration_id) + if existing is not None: + return existing + fav = FlowFavorite(user_id=user_id, registration_id=registration_id) + return self.repo.add_favorite(fav) + + def remove_favorite(self, user_id: int, registration_id: int) -> None: + """Remove a flow from user's favourites. + + Raises + ------ + FavoriteNotFoundError + If the favourite doesn't exist. 
+ """ + existing = self.repo.get_favorite(user_id, registration_id) + if existing is None: + raise FavoriteNotFoundError(user_id=user_id, registration_id=registration_id) + self.repo.remove_favorite(user_id, registration_id) + + def list_favorites(self, user_id: int) -> list[FlowRegistrationOut]: + """List all flows the user has favourited, enriched. + + Uses bulk enrichment to avoid N+1 queries. + """ + favs = self.repo.list_favorites(user_id) + flows: list[FlowRegistration] = [] + for fav in favs: + flow = self.repo.get_flow(fav.registration_id) + if flow is not None: + flows.append(flow) + return self._bulk_enrich_flows(flows, user_id) + + # ------------------------------------------------------------------ # + # Follows + # ------------------------------------------------------------------ # + + def add_follow(self, user_id: int, registration_id: int) -> FlowFollow: + """Follow a flow (idempotent). + + Raises + ------ + FlowNotFoundError + If the flow doesn't exist. + """ + flow = self.repo.get_flow(registration_id) + if flow is None: + raise FlowNotFoundError(registration_id=registration_id) + existing = self.repo.get_follow(user_id, registration_id) + if existing is not None: + return existing + follow = FlowFollow(user_id=user_id, registration_id=registration_id) + return self.repo.add_follow(follow) + + def remove_follow(self, user_id: int, registration_id: int) -> None: + """Unfollow a flow. + + Raises + ------ + FollowNotFoundError + If the follow record doesn't exist. + """ + existing = self.repo.get_follow(user_id, registration_id) + if existing is None: + raise FollowNotFoundError(user_id=user_id, registration_id=registration_id) + self.repo.remove_follow(user_id, registration_id) + + def list_following(self, user_id: int) -> list[FlowRegistrationOut]: + """List all flows the user is following, enriched. + + Uses bulk enrichment to avoid N+1 queries. + """ + follows = self.repo.list_follows(user_id) + flows: list[FlowRegistration] = [] + for follow in follows: + flow = self.repo.get_flow(follow.registration_id) + if flow is not None: + flows.append(flow) + return self._bulk_enrich_flows(flows, user_id) + + # ------------------------------------------------------------------ # + # Dashboard / Stats + # ------------------------------------------------------------------ # + + def get_catalog_stats(self, user_id: int) -> CatalogStats: + """Return an overview of the catalog for the dashboard. + + Uses bulk enrichment for favourite flows to avoid N+1 queries. 
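``_bulk_enrich_flows`` itself is not part of this hunk; the idea the docstrings refer to is roughly the following (a hypothetical sketch — the bulk repository accessors used here do not exist under these names):

def bulk_enrich(repo, flows, user_id):
    # Fetch favourite/follow/run information for ALL flows in a handful of
    # queries, instead of three or four queries per flow (the N+1 pattern the
    # old route helper had).
    flow_ids = [flow.id for flow in flows]
    fav_ids = set(repo.favorite_flow_ids(user_id, flow_ids))     # hypothetical bulk accessor
    follow_ids = set(repo.followed_flow_ids(user_id, flow_ids))  # hypothetical bulk accessor
    run_stats = repo.run_stats_by_flow(flow_ids)                 # hypothetical: {flow_id: (count, last_run)}
    enriched = []
    for flow in flows:
        run_count, last_run = run_stats.get(flow.id, (0, None))
        enriched.append({
            "id": flow.id,
            "is_favorite": flow.id in fav_ids,
            "is_following": flow.id in follow_ids,
            "run_count": run_count,
            "last_run_at": last_run.started_at if last_run else None,
        })
    return enriched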
+ """ + total_ns = self.repo.count_catalog_namespaces() + total_flows = self.repo.count_all_flows() + total_runs = self.repo.count_runs() + total_favs = self.repo.count_favorites(user_id) + + recent_runs = self.repo.list_runs(limit=10, offset=0) + recent_out = [self._run_to_out(r) for r in recent_runs] + + # Bulk enrich favourite flows + favs = self.repo.list_favorites(user_id) + flows: list[FlowRegistration] = [] + for fav in favs: + flow = self.repo.get_flow(fav.registration_id) + if flow is not None: + flows.append(flow) + fav_flows = self._bulk_enrich_flows(flows, user_id) + + return CatalogStats( + total_namespaces=total_ns, + total_flows=total_flows, + total_runs=total_runs, + total_favorites=total_favs, + recent_runs=recent_out, + favorite_flows=fav_flows, + ) diff --git a/flowfile_core/flowfile_core/routes/catalog.py b/flowfile_core/flowfile_core/routes/catalog.py index 8ceb87455..c6c5cd623 100644 --- a/flowfile_core/flowfile_core/routes/catalog.py +++ b/flowfile_core/flowfile_core/routes/catalog.py @@ -5,10 +5,12 @@ - Flow registration (persistent flow metadata) - Run history with versioned snapshots - Favorites and follows + +This module is a thin HTTP adapter: it delegates all business logic to +``CatalogService`` and translates domain exceptions into HTTP responses. """ import json -import os from pathlib import Path from fastapi import APIRouter, Depends, HTTPException, Query @@ -17,14 +19,20 @@ from flowfile_core import flow_file_handler from flowfile_core.auth.jwt import get_current_active_user -from flowfile_core.database.connection import get_db -from flowfile_core.database.models import ( - CatalogNamespace, - FlowFavorite, - FlowFollow, - FlowRegistration, - FlowRun, +from flowfile_core.catalog import ( + CatalogService, + FavoriteNotFoundError, + FlowNotFoundError, + FollowNotFoundError, + NamespaceExistsError, + NamespaceNotEmptyError, + NamespaceNotFoundError, + NestingLimitError, + NoSnapshotError, + RunNotFoundError, + SQLAlchemyCatalogRepository, ) +from flowfile_core.database.connection import get_db from flowfile_core.schemas.catalog_schema import ( CatalogStats, FavoriteOut, @@ -48,44 +56,14 @@ # --------------------------------------------------------------------------- -# Helpers +# Dependency injection # --------------------------------------------------------------------------- -def _enrich_flow( - flow: FlowRegistration, - db: Session, - user_id: int, -) -> FlowRegistrationOut: - """Attach favourite/follow flags and run stats to a FlowRegistration row.""" - is_fav = db.query(FlowFavorite).filter_by( - user_id=user_id, registration_id=flow.id - ).first() is not None - is_follow = db.query(FlowFollow).filter_by( - user_id=user_id, registration_id=flow.id - ).first() is not None - run_count = db.query(FlowRun).filter_by(registration_id=flow.id).count() - last_run = ( - db.query(FlowRun) - .filter_by(registration_id=flow.id) - .order_by(FlowRun.started_at.desc()) - .first() - ) - return FlowRegistrationOut( - id=flow.id, - name=flow.name, - description=flow.description, - flow_path=flow.flow_path, - namespace_id=flow.namespace_id, - owner_id=flow.owner_id, - created_at=flow.created_at, - updated_at=flow.updated_at, - is_favorite=is_fav, - is_following=is_follow, - run_count=run_count, - last_run_at=last_run.started_at if last_run else None, - last_run_success=last_run.success if last_run else None, - file_exists=os.path.exists(flow.flow_path) if flow.flow_path else False, - ) + +def get_catalog_service(db: Session = Depends(get_db)) -> CatalogService: + 
"""FastAPI dependency that provides a configured ``CatalogService``.""" + repo = SQLAlchemyCatalogRepository(db) + return CatalogService(repo) # --------------------------------------------------------------------------- @@ -96,155 +74,70 @@ def _enrich_flow( @router.get("/namespaces", response_model=list[NamespaceOut]) def list_namespaces( parent_id: int | None = None, - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): """List namespaces, optionally filtered by parent.""" - q = db.query(CatalogNamespace) - if parent_id is not None: - q = q.filter(CatalogNamespace.parent_id == parent_id) - else: - q = q.filter(CatalogNamespace.parent_id.is_(None)) - return q.order_by(CatalogNamespace.name).all() + return service.list_namespaces(parent_id) @router.post("/namespaces", response_model=NamespaceOut, status_code=201) def create_namespace( body: NamespaceCreate, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): """Create a catalog (level 0) or schema (level 1) namespace.""" - level = 0 - if body.parent_id is not None: - parent = db.get(CatalogNamespace, body.parent_id) - if parent is None: - raise HTTPException(404, "Parent namespace not found") - if parent.level >= 1: - raise HTTPException(422, "Cannot nest deeper than catalog -> schema") - level = parent.level + 1 - - existing = ( - db.query(CatalogNamespace) - .filter_by(name=body.name, parent_id=body.parent_id) - .first() - ) - if existing: + try: + return service.create_namespace( + name=body.name, + owner_id=current_user.id, + parent_id=body.parent_id, + description=body.description, + ) + except NamespaceNotFoundError: + raise HTTPException(404, "Parent namespace not found") + except NamespaceExistsError: raise HTTPException(409, "Namespace with this name already exists at this level") - - ns = CatalogNamespace( - name=body.name, - parent_id=body.parent_id, - level=level, - description=body.description, - owner_id=current_user.id, - ) - db.add(ns) - db.commit() - db.refresh(ns) - return ns + except NestingLimitError: + raise HTTPException(422, "Cannot nest deeper than catalog -> schema") @router.put("/namespaces/{namespace_id}", response_model=NamespaceOut) def update_namespace( namespace_id: int, body: NamespaceUpdate, - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - ns = db.get(CatalogNamespace, namespace_id) - if ns is None: + try: + return service.update_namespace( + namespace_id=namespace_id, + name=body.name, + description=body.description, + ) + except NamespaceNotFoundError: raise HTTPException(404, "Namespace not found") - if body.name is not None: - ns.name = body.name - if body.description is not None: - ns.description = body.description - db.commit() - db.refresh(ns) - return ns @router.delete("/namespaces/{namespace_id}", status_code=204) def delete_namespace( namespace_id: int, - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - ns = db.get(CatalogNamespace, namespace_id) - if ns is None: + try: + service.delete_namespace(namespace_id) + except NamespaceNotFoundError: raise HTTPException(404, "Namespace not found") - # Prevent deletion if children or flows exist - children = db.query(CatalogNamespace).filter_by(parent_id=namespace_id).count() - flows = db.query(FlowRegistration).filter_by(namespace_id=namespace_id).count() - if children > 0 or flows > 0: + except NamespaceNotEmptyError: raise 
HTTPException(422, "Cannot delete namespace with children or flows") - db.delete(ns) - db.commit() @router.get("/namespaces/tree", response_model=list[NamespaceTree]) def get_namespace_tree( current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): """Return the full catalog tree with flows nested under schemas.""" - catalogs = ( - db.query(CatalogNamespace) - .filter(CatalogNamespace.parent_id.is_(None)) - .order_by(CatalogNamespace.name) - .all() - ) - result = [] - for cat in catalogs: - schemas_db = ( - db.query(CatalogNamespace) - .filter_by(parent_id=cat.id) - .order_by(CatalogNamespace.name) - .all() - ) - children = [] - for schema in schemas_db: - flows_db = ( - db.query(FlowRegistration) - .filter_by(namespace_id=schema.id) - .order_by(FlowRegistration.name) - .all() - ) - flow_outs = [_enrich_flow(f, db, current_user.id) for f in flows_db] - children.append( - NamespaceTree( - id=schema.id, - name=schema.name, - parent_id=schema.parent_id, - level=schema.level, - description=schema.description, - owner_id=schema.owner_id, - created_at=schema.created_at, - updated_at=schema.updated_at, - children=[], - flows=flow_outs, - ) - ) - # Also include flows directly under catalog (unschema'd) - root_flows_db = ( - db.query(FlowRegistration) - .filter_by(namespace_id=cat.id) - .order_by(FlowRegistration.name) - .all() - ) - root_flows = [_enrich_flow(f, db, current_user.id) for f in root_flows_db] - result.append( - NamespaceTree( - id=cat.id, - name=cat.name, - parent_id=cat.parent_id, - level=cat.level, - description=cat.description, - owner_id=cat.owner_id, - created_at=cat.created_at, - updated_at=cat.updated_at, - children=children, - flows=root_flows, - ) - ) - return result + return service.get_namespace_tree(user_id=current_user.id) # --------------------------------------------------------------------------- @@ -254,18 +147,10 @@ def get_namespace_tree( @router.get("/default-namespace-id") def get_default_namespace_id( - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): """Return the ID of the default 'user_flows' schema under 'General'.""" - general = db.query(CatalogNamespace).filter_by(name="General", parent_id=None).first() - if general is None: - return None - user_flows = db.query(CatalogNamespace).filter_by( - name="user_flows", parent_id=general.id - ).first() - if user_flows is None: - return None - return user_flows.id + return service.get_default_namespace_id() # --------------------------------------------------------------------------- @@ -277,48 +162,39 @@ def get_default_namespace_id( def list_flows( namespace_id: int | None = None, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - q = db.query(FlowRegistration) - if namespace_id is not None: - q = q.filter_by(namespace_id=namespace_id) - flows = q.order_by(FlowRegistration.name).all() - return [_enrich_flow(f, db, current_user.id) for f in flows] + return service.list_flows(user_id=current_user.id, namespace_id=namespace_id) @router.post("/flows", response_model=FlowRegistrationOut, status_code=201) def register_flow( body: FlowRegistrationCreate, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - if body.namespace_id is not None: - ns = db.get(CatalogNamespace, body.namespace_id) - if ns is None: - raise 
HTTPException(404, "Namespace not found") - flow = FlowRegistration( - name=body.name, - description=body.description, - flow_path=body.flow_path, - namespace_id=body.namespace_id, - owner_id=current_user.id, - ) - db.add(flow) - db.commit() - db.refresh(flow) - return _enrich_flow(flow, db, current_user.id) + try: + return service.register_flow( + name=body.name, + flow_path=body.flow_path, + owner_id=current_user.id, + namespace_id=body.namespace_id, + description=body.description, + ) + except NamespaceNotFoundError: + raise HTTPException(404, "Namespace not found") @router.get("/flows/{flow_id}", response_model=FlowRegistrationOut) def get_flow( flow_id: int, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - flow = db.get(FlowRegistration, flow_id) - if flow is None: + try: + return service.get_flow(registration_id=flow_id, user_id=current_user.id) + except FlowNotFoundError: raise HTTPException(404, "Flow not found") - return _enrich_flow(flow, db, current_user.id) @router.put("/flows/{flow_id}", response_model=FlowRegistrationOut) @@ -326,35 +202,29 @@ def update_flow( flow_id: int, body: FlowRegistrationUpdate, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - flow = db.get(FlowRegistration, flow_id) - if flow is None: + try: + return service.update_flow( + registration_id=flow_id, + requesting_user_id=current_user.id, + name=body.name, + description=body.description, + namespace_id=body.namespace_id, + ) + except FlowNotFoundError: raise HTTPException(404, "Flow not found") - if body.name is not None: - flow.name = body.name - if body.description is not None: - flow.description = body.description - if body.namespace_id is not None: - flow.namespace_id = body.namespace_id - db.commit() - db.refresh(flow) - return _enrich_flow(flow, db, current_user.id) @router.delete("/flows/{flow_id}", status_code=204) def delete_flow( flow_id: int, - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - flow = db.get(FlowRegistration, flow_id) - if flow is None: + try: + service.delete_flow(registration_id=flow_id) + except FlowNotFoundError: raise HTTPException(404, "Flow not found") - # Clean up related records - db.query(FlowFavorite).filter_by(registration_id=flow_id).delete() - db.query(FlowFollow).filter_by(registration_id=flow_id).delete() - db.delete(flow) - db.commit() # --------------------------------------------------------------------------- @@ -367,63 +237,23 @@ def list_runs( registration_id: int | None = None, limit: int = Query(50, ge=1, le=500), offset: int = Query(0, ge=0), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - q = db.query(FlowRun) - if registration_id is not None: - q = q.filter_by(registration_id=registration_id) - runs = ( - q.order_by(FlowRun.started_at.desc()) - .offset(offset) - .limit(limit) - .all() + return service.list_runs( + registration_id=registration_id, limit=limit, offset=offset ) - return [ - FlowRunOut( - id=r.id, - registration_id=r.registration_id, - flow_name=r.flow_name, - flow_path=r.flow_path, - user_id=r.user_id, - started_at=r.started_at, - ended_at=r.ended_at, - success=r.success, - nodes_completed=r.nodes_completed, - number_of_nodes=r.number_of_nodes, - duration_seconds=r.duration_seconds, - run_type=r.run_type, - has_snapshot=r.flow_snapshot is not None, - ) - for r in 
runs - ] @router.get("/runs/{run_id}", response_model=FlowRunDetail) def get_run_detail( run_id: int, - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): """Get a single run including the YAML snapshot of the flow version that ran.""" - run = db.get(FlowRun, run_id) - if run is None: + try: + return service.get_run_detail(run_id) + except RunNotFoundError: raise HTTPException(404, "Run not found") - return FlowRunDetail( - id=run.id, - registration_id=run.registration_id, - flow_name=run.flow_name, - flow_path=run.flow_path, - user_id=run.user_id, - started_at=run.started_at, - ended_at=run.ended_at, - success=run.success, - nodes_completed=run.nodes_completed, - number_of_nodes=run.number_of_nodes, - duration_seconds=run.duration_seconds, - run_type=run.run_type, - has_snapshot=run.flow_snapshot is not None, - flow_snapshot=run.flow_snapshot, - node_results_json=run.node_results_json, - ) # --------------------------------------------------------------------------- @@ -435,17 +265,17 @@ def get_run_detail( def open_run_snapshot( run_id: int, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): """Write the run's flow snapshot to a temp file and import it into the designer.""" - run = db.get(FlowRun, run_id) - if run is None: + try: + snapshot_data = service.get_run_snapshot(run_id) + except RunNotFoundError: raise HTTPException(404, "Run not found") - if not run.flow_snapshot: + except NoSnapshotError: raise HTTPException(422, "No flow snapshot available for this run") # Determine file extension based on content - snapshot_data = run.flow_snapshot try: json.loads(snapshot_data) suffix = ".json" @@ -473,56 +303,35 @@ def open_run_snapshot( @router.get("/favorites", response_model=list[FlowRegistrationOut]) def list_favorites( current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - favs = ( - db.query(FlowFavorite) - .filter_by(user_id=current_user.id) - .order_by(FlowFavorite.created_at.desc()) - .all() - ) - result = [] - for fav in favs: - flow = db.get(FlowRegistration, fav.registration_id) - if flow: - result.append(_enrich_flow(flow, db, current_user.id)) - return result + return service.list_favorites(user_id=current_user.id) @router.post("/flows/{flow_id}/favorite", response_model=FavoriteOut, status_code=201) def add_favorite( flow_id: int, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - flow = db.get(FlowRegistration, flow_id) - if flow is None: + try: + return service.add_favorite( + user_id=current_user.id, registration_id=flow_id + ) + except FlowNotFoundError: raise HTTPException(404, "Flow not found") - existing = db.query(FlowFavorite).filter_by( - user_id=current_user.id, registration_id=flow_id - ).first() - if existing: - return existing - fav = FlowFavorite(user_id=current_user.id, registration_id=flow_id) - db.add(fav) - db.commit() - db.refresh(fav) - return fav @router.delete("/flows/{flow_id}/favorite", status_code=204) def remove_favorite( flow_id: int, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - fav = db.query(FlowFavorite).filter_by( - user_id=current_user.id, registration_id=flow_id - ).first() - if fav is None: + try: + 
service.remove_favorite(user_id=current_user.id, registration_id=flow_id) + except FavoriteNotFoundError: raise HTTPException(404, "Favorite not found") - db.delete(fav) - db.commit() # --------------------------------------------------------------------------- @@ -533,56 +342,35 @@ def remove_favorite( @router.get("/following", response_model=list[FlowRegistrationOut]) def list_following( current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - follows = ( - db.query(FlowFollow) - .filter_by(user_id=current_user.id) - .order_by(FlowFollow.created_at.desc()) - .all() - ) - result = [] - for follow in follows: - flow = db.get(FlowRegistration, follow.registration_id) - if flow: - result.append(_enrich_flow(flow, db, current_user.id)) - return result + return service.list_following(user_id=current_user.id) @router.post("/flows/{flow_id}/follow", response_model=FollowOut, status_code=201) def add_follow( flow_id: int, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - flow = db.get(FlowRegistration, flow_id) - if flow is None: + try: + return service.add_follow( + user_id=current_user.id, registration_id=flow_id + ) + except FlowNotFoundError: raise HTTPException(404, "Flow not found") - existing = db.query(FlowFollow).filter_by( - user_id=current_user.id, registration_id=flow_id - ).first() - if existing: - return existing - follow = FlowFollow(user_id=current_user.id, registration_id=flow_id) - db.add(follow) - db.commit() - db.refresh(follow) - return follow @router.delete("/flows/{flow_id}/follow", status_code=204) def remove_follow( flow_id: int, current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - follow = db.query(FlowFollow).filter_by( - user_id=current_user.id, registration_id=flow_id - ).first() - if follow is None: + try: + service.remove_follow(user_id=current_user.id, registration_id=flow_id) + except FollowNotFoundError: raise HTTPException(404, "Follow not found") - db.delete(follow) - db.commit() # --------------------------------------------------------------------------- @@ -593,50 +381,6 @@ def remove_follow( @router.get("/stats", response_model=CatalogStats) def get_catalog_stats( current_user=Depends(get_current_active_user), - db: Session = Depends(get_db), + service: CatalogService = Depends(get_catalog_service), ): - total_ns = db.query(CatalogNamespace).filter_by(level=0).count() - total_flows = db.query(FlowRegistration).count() - total_runs = db.query(FlowRun).count() - total_favs = db.query(FlowFavorite).filter_by(user_id=current_user.id).count() - recent = ( - db.query(FlowRun) - .order_by(FlowRun.started_at.desc()) - .limit(10) - .all() - ) - recent_out = [ - FlowRunOut( - id=r.id, - registration_id=r.registration_id, - flow_name=r.flow_name, - flow_path=r.flow_path, - user_id=r.user_id, - started_at=r.started_at, - ended_at=r.ended_at, - success=r.success, - nodes_completed=r.nodes_completed, - number_of_nodes=r.number_of_nodes, - duration_seconds=r.duration_seconds, - run_type=r.run_type, - has_snapshot=r.flow_snapshot is not None, - ) - for r in recent - ] - fav_ids = [ - f.registration_id - for f in db.query(FlowFavorite).filter_by(user_id=current_user.id).all() - ] - fav_flows = [] - for fid in fav_ids: - flow = db.get(FlowRegistration, fid) - if flow: - fav_flows.append(_enrich_flow(flow, db, 
current_user.id)) - return CatalogStats( - total_namespaces=total_ns, - total_flows=total_flows, - total_runs=total_runs, - total_favorites=total_favs, - recent_runs=recent_out, - favorite_flows=fav_flows, - ) + return service.get_catalog_stats(user_id=current_user.id) diff --git a/flowfile_core/flowfile_core/routes/routes.py b/flowfile_core/flowfile_core/routes/routes.py index 0b9ace110..1fec7a1e4 100644 --- a/flowfile_core/flowfile_core/routes/routes.py +++ b/flowfile_core/flowfile_core/routes/routes.py @@ -69,22 +69,28 @@ def get_node_model(setting_name_ref: str): def _auto_register_flow(flow_path: str, name: str, user_id: int | None) -> None: - """Register a flow in the default catalog namespace (General > user_flows) if it exists.""" + """Register a flow in the default catalog namespace (General > user_flows) if it exists. + + Failures are logged at info level since users may wonder why some flows + don't appear in the catalog. + """ if user_id is None or flow_path is None: return try: with get_db_context() as db: general = db.query(CatalogNamespace).filter_by(name="General", parent_id=None).first() if general is None: + logger.info("Auto-registration skipped: 'General' catalog namespace not found") return user_flows = db.query(CatalogNamespace).filter_by( name="user_flows", parent_id=general.id ).first() if user_flows is None: + logger.info("Auto-registration skipped: 'user_flows' schema not found under 'General'") return existing = db.query(FlowRegistration).filter_by(flow_path=flow_path).first() if existing: - return + return # Already registered, silent success reg = FlowRegistration( name=name or Path(flow_path).stem, flow_path=flow_path, @@ -93,8 +99,9 @@ def _auto_register_flow(flow_path: str, name: str, user_id: int | None) -> None: ) db.add(reg) db.commit() + logger.info(f"Auto-registered flow '{reg.name}' in default namespace") except Exception: - logger.debug("Auto-registration in default namespace failed (non-critical)", exc_info=True) + logger.info(f"Auto-registration failed for '{flow_path}' (non-critical)", exc_info=True) @router.post("/upload/") @@ -238,27 +245,38 @@ async def trigger_fetch_node_data(flow_id: int, node_id: int, background_tasks: def _run_and_track(flow, user_id: int | None): - """Wrapper that runs a flow and persists the run record to the database.""" + """Wrapper that runs a flow and persists the run record to the database. + + This runs in a BackgroundTask. If DB persistence fails, the run still + completed but won't appear in the run history. Failures are logged at + ERROR level so they're visible in logs. 
+ """ + flow_name = getattr(flow.flow_settings, "name", None) or getattr(flow, "__name__", "unknown") + run_info = flow.run_graph() if run_info is None: + logger.error(f"Flow '{flow_name}' returned no run_info - run tracking skipped") return # Persist run record + tracking_succeeded = False try: - # Build snapshot + # Build snapshot (non-critical if fails) + snapshot_yaml = None try: snapshot_data = flow.get_flowfile_data() snapshot_yaml = snapshot_data.model_dump_json() - except Exception: - snapshot_yaml = None + except Exception as snap_err: + logger.warning(f"Flow '{flow_name}': snapshot serialization failed: {snap_err}") - # Serialise node results + # Serialise node results (non-critical if fails) + node_results = None try: node_results = json.dumps( [nr.model_dump(mode="json") for nr in (run_info.node_step_result or [])], ) - except Exception: - node_results = None + except Exception as node_err: + logger.warning(f"Flow '{flow_name}': node results serialization failed: {node_err}") duration = None if run_info.start_time and run_info.end_time: @@ -275,7 +293,7 @@ def _run_and_track(flow, user_id: int | None): db_run = FlowRun( registration_id=reg_id, - flow_name=flow.flow_settings.name or flow.__name__, + flow_name=flow_name, flow_path=flow_path, user_id=user_id if user_id is not None else 0, started_at=run_info.start_time, @@ -290,8 +308,25 @@ def _run_and_track(flow, user_id: int | None): ) db.add(db_run) db.commit() + tracking_succeeded = True + logger.info( + f"Flow '{flow_name}' run tracked: success={run_info.success}, " + f"nodes={run_info.nodes_completed}/{run_info.number_of_nodes}, " + f"duration={duration:.2f}s" if duration else f"duration=N/A" + ) except Exception as exc: - logger.warning(f"Failed to persist flow run record: {exc}") + logger.error( + f"Failed to persist run record for flow '{flow_name}'. " + f"The flow {'succeeded' if run_info.success else 'failed'} but won't appear in run history. " + f"Error: {exc}", + exc_info=True, + ) + + if not tracking_succeeded: + logger.error( + f"Run tracking failed for flow '{flow_name}'. " + "Check database connectivity and FlowRun table schema." + ) @router.post('/flow/run/', tags=['editor']) diff --git a/kernel_runtime/kernel_runtime/artifact_store.py b/kernel_runtime/kernel_runtime/artifact_store.py index f9382493d..42eedc5c8 100644 --- a/kernel_runtime/kernel_runtime/artifact_store.py +++ b/kernel_runtime/kernel_runtime/artifact_store.py @@ -9,6 +9,21 @@ class ArtifactStore: Artifacts are scoped by ``flow_id`` so that multiple flows sharing the same kernel container cannot collide on artifact names. + + .. note:: **Tech Debt / Future Improvement** + + Currently stores the entire object in memory via ``self._artifacts``. + For very large artifacts (e.g., ML models >1GB), this causes memory + pressure and potential OOM. A future improvement would be to: + + 1. Implement a spill-to-disk mechanism (e.g., pickle to temp file when + size exceeds threshold, keep only metadata in memory). + 2. Or integrate with an external object store (S3, MinIO) for truly + large artifacts, storing only a reference here. + 3. For blob uploads, consider a streaming/chunked approach rather than + reading the entire file into memory before storage. 
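To make option 1 above concrete, a rough spill-to-disk sketch — the threshold, file layout, and dict-based store are assumptions for illustration, not the current ArtifactStore implementation:

import pickle
import tempfile
from pathlib import Path

SPILL_THRESHOLD_BYTES = 64 * 1024 * 1024  # assumed cut-off; tune per deployment

def store_artifact(artifacts: dict, name: str, obj) -> None:
    # Serialize once to measure size; small objects stay in memory,
    # large ones are written to a temp file and only metadata is kept.
    payload = pickle.dumps(obj)
    if len(payload) <= SPILL_THRESHOLD_BYTES:
        artifacts[name] = {"location": "memory", "value": obj}
    else:
        with tempfile.NamedTemporaryFile(suffix=f"-{name}.pkl", delete=False) as fh:
            fh.write(payload)
        artifacts[name] = {"location": "disk", "path": Path(fh.name), "size": len(payload)}

def load_artifact(artifacts: dict, name: str):
    entry = artifacts[name]
    if entry["location"] == "memory":
        return entry["value"]
    return pickle.loads(entry["path"].read_bytes())

Measuring the size this way still pickles the whole object in memory once; avoiding that spike entirely would need a streaming serializer, which is part of why an external object store (option 2) may be the better long-term fix.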
+ + See: https://github.com/Edwardvaneechoud/Flowfile/issues/XXX (placeholder) """ def __init__(self): From 431feaae6f294b513c174a11abed7c076286c5ec Mon Sep 17 00:00:00 2001 From: Edward van Eechoud <41021650+Edwardvaneechoud@users.noreply.github.com> Date: Tue, 3 Feb 2026 08:56:52 +0100 Subject: [PATCH 11/38] Add artifact visualization with edges and node badges (#288) --- .../flowfile_core/flowfile/artifacts.py | 129 +++++++- flowfile_core/flowfile_core/routes/routes.py | 18 ++ .../tests/flowfile/test_artifact_context.py | 17 +- .../src/renderer/app/api/flow.api.ts | 16 + .../app/components/nodes/ArtifactBadge.vue | 247 ++++++++++++++ .../app/components/nodes/NodeWrapper.vue | 4 + .../app/features/designer/dataPreview.vue | 304 ++++++++++++++++-- .../src/renderer/app/stores/flow-store.ts | 26 +- .../src/renderer/app/types/flow.types.ts | 39 +++ .../app/views/DesignerView/Canvas.vue | 15 +- 10 files changed, 763 insertions(+), 52 deletions(-) create mode 100644 flowfile_frontend/src/renderer/app/components/nodes/ArtifactBadge.vue diff --git a/flowfile_core/flowfile_core/flowfile/artifacts.py b/flowfile_core/flowfile_core/flowfile/artifacts.py index 8b6d33337..4199d9881 100644 --- a/flowfile_core/flowfile_core/flowfile/artifacts.py +++ b/flowfile_core/flowfile_core/flowfile/artifacts.py @@ -142,8 +142,10 @@ def record_deleted( ) -> None: """Record that *node_id* deleted the given artifacts from *kernel_id*. - Removes the artifacts from the kernel index and from published - lists of the publishing nodes (looked up via reverse index). + Removes the artifacts from the kernel index so they are no longer + available to downstream nodes. The original publisher's + ``state.published`` list is **not** modified — it serves as a + permanent record of what the node produced. """ state = self._get_or_create_state(node_id) state.deleted.extend(artifact_names) @@ -151,8 +153,7 @@ def record_deleted( kernel_map = self._kernel_artifacts.get(kernel_id, {}) for name in artifact_names: kernel_map.pop(name, None) - - # Use the reverse index to update only the affected nodes + # Clean up the reverse index entry but leave published intact key = (kernel_id, name) publisher_ids = self._publisher_index.pop(key, set()) @@ -162,14 +163,9 @@ def record_deleted( self._deletion_origins.setdefault(node_id, []).append( (kernel_id, name, pid) ) - - for pid in publisher_ids: - ns = self._node_states.get(pid) - if ns is not None: - ns.published = [ - r for r in ns.published - if not (r.kernel_id == kernel_id and r.name == name) - ] + # NOTE: We do NOT remove from publisher's published list here. + # The published list serves as a permanent historical record + # for visualization (badges showing what the node produced). logger.debug( "Node %s deleted %d artifact(s) on kernel '%s': %s", @@ -192,16 +188,29 @@ def compute_available( """Compute which artifacts are available to *node_id*. An artifact is available if it was published by an upstream node - (direct or transitive) that used the **same** ``kernel_id``. + (direct or transitive) that used the **same** ``kernel_id`` and + has **not** been deleted by a later upstream node. + + Upstream nodes are processed in topological order (sorted by node ID). + For each node, deletions are applied first, then publications — so + a later node can delete-then-republish an artifact and the new + version will be available downstream. The result is stored on the node's :class:`NodeArtifactState` and also returned. 
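The delete-then-republish behaviour described here, in concrete terms (class name and recording calls as used in the tests further down; the module path follows the diff header and the argument order of compute_available is assumed from this docstring):

from flowfile_core.flowfile.artifacts import ArtifactContext

ctx = ArtifactContext()
ctx.record_published(1, "k1", ["model"])   # node 1 publishes "model"
ctx.record_deleted(2, "k1", ["model"])     # node 2 deletes it ...
ctx.record_published(2, "k1", ["model"])   # ... and republishes a new version

# Downstream node 3 sees exactly one "model", and it is node 2's version:
# node 2 is processed after node 1, and its deletion is applied before its
# own publication.
available = ctx.compute_available(3, "k1", [1, 2])
assert list(available) == ["model"]
assert available["model"].source_node_id == 2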
""" available: dict[str, ArtifactRef] = {} - for uid in upstream_node_ids: + + # Sort by node ID to ensure topological processing order + # (FlowGraph._get_upstream_node_ids returns BFS order which is reversed) + for uid in sorted(upstream_node_ids): upstream_state = self._node_states.get(uid) if upstream_state is None: continue + # First, remove artifacts deleted by this upstream node + for name in upstream_state.deleted: + available.pop(name, None) + # Then, add artifacts published by this upstream node for ref in upstream_state.published: if ref.kernel_id == kernel_id: available[ref.name] = ref @@ -268,8 +277,8 @@ def get_producer_nodes_for_deletions( def clear_kernel(self, kernel_id: str) -> None: """Remove tracking for a specific kernel. - Also removes the corresponding published refs from node states - and cleans up the reverse index. + Clears the kernel index and availability maps. The ``published`` + lists on node states are preserved as historical records. """ # Clean reverse index entries for this kernel keys_to_remove = [k for k in self._publisher_index if k[0] == kernel_id] @@ -287,7 +296,6 @@ def clear_kernel(self, kernel_id: str) -> None: self._kernel_artifacts.pop(kernel_id, None) for state in self._node_states.values(): - state.published = [r for r in state.published if r.kernel_id != kernel_id] state.available = { k: v for k, v in state.available.items() if v.kernel_id != kernel_id } @@ -352,6 +360,93 @@ def restore_node_state(self, node_id: int, state: NodeArtifactState) -> None: key = (ref.kernel_id, ref.name) self._publisher_index.setdefault(key, set()).add(node_id) + # ------------------------------------------------------------------ + # Visualisation helpers + # ------------------------------------------------------------------ + + def get_artifact_edges(self) -> list[dict[str, Any]]: + """Build a list of artifact edges for canvas visualisation. + + Each edge connects a publisher node to every consumer node that + consumed one of its artifacts (on the same kernel). + + Returns a list of dicts with keys: + source, target, artifact_name, artifact_type, kernel_id + """ + edges: list[dict[str, Any]] = [] + seen: set[tuple[int, int, str]] = set() + + for nid, state in self._node_states.items(): + if not state.consumed: + continue + for art_name in state.consumed: + # Look up the publisher via the available map first + ref = state.available.get(art_name) + if ref is None: + # Fallback: scan kernel artifacts + for km in self._kernel_artifacts.values(): + if art_name in km: + ref = km[art_name] + break + if ref is None: + continue + key = (ref.source_node_id, nid, art_name) + if key in seen: + continue + seen.add(key) + edges.append({ + "source": ref.source_node_id, + "target": nid, + "artifact_name": art_name, + "artifact_type": ref.type_name, + "kernel_id": ref.kernel_id, + }) + + return edges + + def get_node_summaries(self) -> dict[str, dict[str, Any]]: + """Return per-node artifact summary for badge/tab display. 
+ + Returns a dict keyed by str(node_id) with: + published_count, consumed_count, deleted_count, + published, consumed, deleted, kernel_id + """ + summaries: dict[str, dict[str, Any]] = {} + for nid, state in self._node_states.items(): + if not state.published and not state.consumed and not state.deleted: + continue + kernel_id = "" + if state.published: + kernel_id = state.published[0].kernel_id + summaries[str(nid)] = { + "published_count": len(state.published), + "consumed_count": len(state.consumed), + "deleted_count": len(state.deleted), + "published": [ + { + "name": r.name, + "type_name": r.type_name, + "module": r.module, + } + for r in state.published + ], + "consumed": [ + { + "name": name, + "source_node_id": state.available[name].source_node_id + if name in state.available + else None, + "type_name": state.available[name].type_name + if name in state.available + else "", + } + for name in state.consumed + ], + "deleted": list(state.deleted), + "kernel_id": kernel_id, + } + return summaries + # ------------------------------------------------------------------ # Serialisation # ------------------------------------------------------------------ diff --git a/flowfile_core/flowfile_core/routes/routes.py b/flowfile_core/flowfile_core/routes/routes.py index 1fec7a1e4..7c6c8145a 100644 --- a/flowfile_core/flowfile_core/routes/routes.py +++ b/flowfile_core/flowfile_core/routes/routes.py @@ -1042,6 +1042,24 @@ def get_vue_flow_data(flow_id: int) -> schemas.VueFlowInput: return data +@router.get('/flow/artifacts', tags=['editor']) +def get_flow_artifacts(flow_id: int): + """Returns artifact visualization data for the canvas. + + Includes per-node artifact summaries (for badges/tooltips) and + artifact edges (for dashed-line connections between publisher and + consumer nodes). 
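The shape of the payload this endpoint returns, reconstructed from get_node_summaries() and get_artifact_edges() above (all values are illustrative):

example_response = {
    "nodes": {
        "3": {
            "published_count": 1,
            "consumed_count": 0,
            "deleted_count": 0,
            "published": [
                {"name": "model", "type_name": "RandomForestClassifier", "module": "sklearn.ensemble"}
            ],
            "consumed": [],
            "deleted": [],
            "kernel_id": "k1",
        },
        "7": {
            "published_count": 0,
            "consumed_count": 1,
            "deleted_count": 0,
            "published": [],
            "consumed": [
                {"name": "model", "source_node_id": 3, "type_name": "RandomForestClassifier"}
            ],
            "deleted": [],
            "kernel_id": "",
        },
    },
    "edges": [
        {
            "source": 3,
            "target": 7,
            "artifact_name": "model",
            "artifact_type": "RandomForestClassifier",
            "kernel_id": "k1",
        }
    ],
}

The frontend's FlowApi.getArtifacts() (added below in flow.api.ts) consumes this payload as FlowArtifactData.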
+ """ + flow = flow_file_handler.get_flow(flow_id) + if flow is None: + raise HTTPException(404, 'Could not find the flow') + ctx = flow.artifact_context + return { + "nodes": ctx.get_node_summaries(), + "edges": ctx.get_artifact_edges(), + } + + @router.get('/analysis_data/graphic_walker_input', tags=['analysis'], response_model=input_schema.NodeExploreData) def get_graphic_walker_input(flow_id: int, node_id: int): """Gets the data and configuration for the Graphic Walker data exploration tool.""" diff --git a/flowfile_core/tests/flowfile/test_artifact_context.py b/flowfile_core/tests/flowfile/test_artifact_context.py index 1d6fd29fa..b4e8a8e37 100644 --- a/flowfile_core/tests/flowfile/test_artifact_context.py +++ b/flowfile_core/tests/flowfile/test_artifact_context.py @@ -194,14 +194,19 @@ def test_record_deleted_removes_from_kernel_index(self): ctx.record_deleted(2, "k1", ["model"]) assert ctx.get_kernel_artifacts("k1") == {} - def test_record_deleted_removes_from_published_lists(self): + def test_record_deleted_preserves_publisher_published_list(self): + """Deletion does NOT remove from publisher's published list (historical record).""" ctx = ArtifactContext() ctx.record_published(1, "k1", ["model", "scaler"]) ctx.record_deleted(2, "k1", ["model"]) + # Publisher's published list is preserved as historical record published = ctx.get_published_by_node(1) names = [r.name for r in published] - assert "model" not in names + assert "model" in names # Still there as historical record assert "scaler" in names + # The deleting node has it tracked in its deleted list + state = ctx._node_states[2] + assert "model" in state.deleted def test_record_deleted_tracks_on_node_state(self): ctx = ArtifactContext() @@ -245,15 +250,19 @@ def test_clear_kernel_removes_only_that_kernel(self): assert ctx.get_kernel_artifacts("k1") == {} assert "encoder" in ctx.get_kernel_artifacts("k2") - def test_clear_kernel_removes_from_node_states(self): + def test_clear_kernel_preserves_published_lists(self): + """clear_kernel removes from kernel index but preserves published (historical record).""" ctx = ArtifactContext() ctx.record_published(1, "k1", ["model"]) ctx.record_published(1, "k2", ["encoder"]) ctx.clear_kernel("k1") + # Published list is preserved as historical record published = ctx.get_published_by_node(1) names = [r.name for r in published] - assert "model" not in names + assert "model" in names # Still there as historical record assert "encoder" in names + # But the kernel index is cleared + assert ctx.get_kernel_artifacts("k1") == {} def test_clear_kernel_removes_from_available(self): ctx = ArtifactContext() diff --git a/flowfile_frontend/src/renderer/app/api/flow.api.ts b/flowfile_frontend/src/renderer/app/api/flow.api.ts index c6d4b8b47..53accc957 100644 --- a/flowfile_frontend/src/renderer/app/api/flow.api.ts +++ b/flowfile_frontend/src/renderer/app/api/flow.api.ts @@ -11,6 +11,7 @@ import type { HistoryState, UndoRedoResult, OperationResponse, + FlowArtifactData, } from "../types"; export class FlowApi { @@ -318,4 +319,19 @@ export class FlowApi { }); return response.data; } + + // ============================================================================ + // Artifact Operations + // ============================================================================ + + /** + * Get artifact visualization data for a flow (badges, edges) + */ + static async getArtifacts(flowId: number): Promise { + const response = await axios.get("/flow/artifacts", { + params: { flow_id: flowId }, + headers: { accept: 
"application/json" }, + }); + return response.data; + } } diff --git a/flowfile_frontend/src/renderer/app/components/nodes/ArtifactBadge.vue b/flowfile_frontend/src/renderer/app/components/nodes/ArtifactBadge.vue new file mode 100644 index 000000000..99403417b --- /dev/null +++ b/flowfile_frontend/src/renderer/app/components/nodes/ArtifactBadge.vue @@ -0,0 +1,247 @@ + + + + + diff --git a/flowfile_frontend/src/renderer/app/components/nodes/NodeWrapper.vue b/flowfile_frontend/src/renderer/app/components/nodes/NodeWrapper.vue index 663143d49..8f6931ccb 100644 --- a/flowfile_frontend/src/renderer/app/components/nodes/NodeWrapper.vue +++ b/flowfile_frontend/src/renderer/app/components/nodes/NodeWrapper.vue @@ -57,6 +57,9 @@ /> + + +
[dataPreview.vue template diff — the Vue markup was stripped during extraction; only the recoverable content is kept below]

  Existing preview strings, re-nested inside a new wrapper:
    - warning banner: "Displayed data might be outdated."
    - empty state: "Step has not stored any data yet. Click here to trigger a run for this node"

  New artifacts panel with three tables:
    - Published: Name | Type | Module       ->  {{ art.name }} | {{ art.type_name || "-" }} | {{ art.module || "-" }}
    - Consumed:  Name | Type | Source Node  ->  {{ art.name }} | {{ art.type_name || "-" }} | {{ art.source_node_id != null ? `Node ${art.source_node_id}` : "-" }}
    - Deleted:   Name                       ->  {{ name }}
    - empty-state message: "No artifacts recorded for this node."

diff --git a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/NotebookCell.vue b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/NotebookCell.vue
new file mode 100644
index 000000000..fd0f1f5c0
--- /dev/null
+++ b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/NotebookCell.vue
@@ -0,0 +1,256 @@
[new component body lost in extraction]

diff --git a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/NotebookEditor.vue b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/NotebookEditor.vue
new file mode 100644
index 000000000..d1443c175
--- /dev/null
+++ b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/NotebookEditor.vue
@@ -0,0 +1,266 @@
[new component body lost in extraction]

diff --git a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue
index 86cfdc558..dd9a1afae 100644
--- a/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue
+++ b/flowfile_frontend/src/renderer/app/components/nodes/node-types/elements/pythonScript/PythonScript.vue
@@ -54,31 +54,7 @@
[PythonScript.vue changes — markup stripped during extraction; only hunk boundaries remain]
[first hunk body lost in extraction]
@@ -117,39 +93,92 @@
[second hunk body lost in extraction]
+ + diff --git a/flowfile_frontend/src/renderer/app/views/CatalogView/CatalogTreeNode.vue b/flowfile_frontend/src/renderer/app/views/CatalogView/CatalogTreeNode.vue index 9db5b8074..5d67112a7 100644 --- a/flowfile_frontend/src/renderer/app/views/CatalogView/CatalogTreeNode.vue +++ b/flowfile_frontend/src/renderer/app/views/CatalogView/CatalogTreeNode.vue @@ -32,7 +32,9 @@ :key="child.id" :node="child" :selected-flow-id="selectedFlowId" + :selected-artifact-id="selectedArtifactId" @select-flow="$emit('selectFlow', $event)" + @select-artifact="$emit('selectArtifact', $event)" @toggle-favorite="$emit('toggleFavorite', $event)" @toggle-follow="$emit('toggleFollow', $event)" @register-flow="$emit('registerFlow', $event)" @@ -71,21 +73,43 @@ > + + +
+          {{ group.name }}
+          {{ group.versionCount }} versions
+          v{{ group.latest.version }}
[surrounding template markup lost in extraction]