diff --git a/frontend/src/components/editor/chrome/wrapper/footer-items/machine-stats.tsx b/frontend/src/components/editor/chrome/wrapper/footer-items/machine-stats.tsx index 59fd3522938..f73dd8e2767 100644 --- a/frontend/src/components/editor/chrome/wrapper/footer-items/machine-stats.tsx +++ b/frontend/src/components/editor/chrome/wrapper/footer-items/machine-stats.tsx @@ -55,8 +55,11 @@ const MemoryUsageBar: React.FC<{ kernel: UsageResponse["kernel"]; server: UsageResponse["server"]; }> = ({ memory, kernel, server }) => { - const { percent, total, available } = memory; + const { percent, total, available, has_cgroup_mem_limit } = memory; const roundedPercent = Math.round(percent); + const memoryLabel = has_cgroup_mem_limit + ? "container memory" + : "computer memory"; const gbFormatter = useNumberFormatter({ maximumFractionDigits: 2, @@ -82,7 +85,7 @@ const MemoryUsageBar: React.FC<{ content={
- computer memory: {formatGB(total - available)} /{" "} + {memoryLabel}: {formatGB(total - available)} /{" "} {formatGB(total)} GB ({roundedPercent}%) {server?.memory && ( diff --git a/marimo/_server/api/endpoints/health.py b/marimo/_server/api/endpoints/health.py index a1c0e8da66d..f06255763c0 100644 --- a/marimo/_server/api/endpoints/health.py +++ b/marimo/_server/api/endpoints/health.py @@ -12,6 +12,8 @@ from marimo._server.api.deps import AppState from marimo._server.router import APIRouter from marimo._utils.health import ( + get_cgroup_cpu_percent, + get_cgroup_mem_stats, get_node_version, get_python_version, get_required_modules_list, @@ -130,12 +132,15 @@ async def usage(request: Request) -> JSONResponse: type: integer free: type: integer + has_cgroup_mem_limit: + type: boolean required: - total - available - percent - used - free + - has_cgroup_mem_limit server: type: object properties: @@ -193,10 +198,28 @@ async def usage(request: Request) -> JSONResponse: import psutil - memory = psutil.virtual_memory() - # interval=None is nonblocking; first value is meaningless but after - # that it's useful. 
- cpu = psutil.cpu_percent(interval=None) + if cgroup_mem_stats := get_cgroup_mem_stats(): + memory_stats = { + "has_cgroup_mem_limit": True, + **cgroup_mem_stats, + } + else: + # Use host memory stats + memory = psutil.virtual_memory() + memory_stats = { + "total": memory.total, + "available": memory.available, + "percent": memory.percent, + "used": memory.used, + "free": memory.free, + "has_cgroup_mem_limit": False, + } + + cpu = get_cgroup_cpu_percent() + if cpu is None: + # interval=None is nonblocking; first call returns meaningless value + # subsequent calls return delta since last call + cpu = psutil.cpu_percent(interval=None) # Server memory (and children) main_process = psutil.Process() @@ -267,14 +290,8 @@ async def usage(request: Request) -> JSONResponse: return JSONResponse( { - # computer memory - "memory": { - "total": memory.total, - "available": memory.available, - "percent": memory.percent, - "used": memory.used, - "free": memory.free, - }, + # computer memory or container memory + "memory": memory_stats, # marimo server "server": { "memory": server_memory, diff --git a/marimo/_utils/health.py b/marimo/_utils/health.py index cebd331b47f..6eea27d51f8 100644 --- a/marimo/_utils/health.py +++ b/marimo/_utils/health.py @@ -2,9 +2,11 @@ from __future__ import annotations import importlib.metadata +import os import subprocess import sys -from typing import Optional +import time +from typing import Optional, TypedDict from marimo import _loggers @@ -12,6 +14,35 @@ TIMEOUT = 10 # seconds +# Module-level state for cgroup CPU percent calculation (like psutil does) +_LAST_CGROUP_CPU_SAMPLE: Optional[tuple[int, float]] = ( + None # (usage_usec, timestamp) +) + + +class MemoryStats(TypedDict): + total: int + used: int + available: int + percent: float + free: int + + +# Constants for cgroup v2 file locations +# Reference: https://www.kernel.org/doc/Documentation/cgroup-v2.txt +CGROUP_V2_MEMORY_MAX_FILE = "/sys/fs/cgroup/memory.max" 
CGROUP_V2_MEMORY_CURRENT_FILE = "/sys/fs/cgroup/memory.current"
CGROUP_V2_CPU_STAT_FILE = "/sys/fs/cgroup/cpu.stat"
CGROUP_V2_CPU_MAX_FILE = "/sys/fs/cgroup/cpu.max"

# cgroup v1 file locations (legacy hierarchy)
# Reference: https://www.kernel.org/doc/Documentation/cgroup-v1/
CGROUP_V1_CPU_USAGE_FILE = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
CGROUP_V1_CPU_CFS_QUOTA_US_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
CGROUP_V1_CPU_CFS_PERIOD_US_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
CGROUP_V1_MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
CGROUP_V1_MEMORY_USAGE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes"

# cgroup v1 has no "max" keyword for memory: an unlimited cgroup reports a
# huge page-aligned number (e.g. 9223372036854771712 on 64-bit kernels with
# 4 KiB pages). Any limit at or above 1 EiB is treated as "no limit" so the
# caller falls back to host memory statistics.
_CGROUP_V1_UNLIMITED_MEMORY_THRESHOLD = 1 << 60


def _read_cgroup_file(path: str) -> str:
    """Return the whitespace-stripped text content of a cgroup file."""
    with open(path, encoding="utf-8") as f:
        return f.read().strip()


def _has_cgroup_cpu_limit() -> bool:
    """
    Return whether a cgroup CPU quota is configured for this process.

    cgroup v2 is checked first, then cgroup v1:
    - v2 ``cpu.max`` holds two fields, ``"<quota> <period>"``; the quota
      field is the literal ``max`` when no limit is set (``"max 100000"``).
    - v1 ``cpu.cfs_quota_us`` holds a positive integer when limited and
      ``-1`` when unlimited.

    Returns:
        True if a quota is set. False if no quota is set, no cgroup file
        exists, or the file cannot be read or parsed.
    """
    try:
        if os.path.exists(CGROUP_V2_CPU_MAX_FILE):
            fields = _read_cgroup_file(CGROUP_V2_CPU_MAX_FILE).split()
            # Only the first field is the quota; the second is the period.
            return bool(fields) and fields[0] != "max"
        # Fallback to cgroup v1 (legacy)
        if os.path.exists(CGROUP_V1_CPU_CFS_QUOTA_US_FILE):
            quota = int(_read_cgroup_file(CGROUP_V1_CPU_CFS_QUOTA_US_FILE))
            return quota > 0
    except (OSError, ValueError) as e:
        LOGGER.debug(f"Error reading container CPU limit: {e}")
    return False


def _build_memory_stats(total: int, used: int) -> "MemoryStats":
    """Build a MemoryStats mapping from a cgroup limit and current usage."""
    # Usage can momentarily exceed a freshly lowered limit; never report a
    # negative amount of available memory.
    available = max(total - used, 0)
    percent = (used / total) * 100 if total > 0 else 0.0
    return MemoryStats(
        total=total,
        used=used,
        available=available,
        percent=percent,
        free=available,  # free == available for cgroup memory
    )


def get_cgroup_mem_stats() -> "Optional[MemoryStats]":
    """
    Get container memory stats from cgroup.

    cgroup v2 files are preferred; cgroup v1 files are only consulted when
    the v2 limit file does not exist.

    Returns:
        Dictionary with memory stats if a cgroup memory limit is configured,
        None if no limit is configured (v2 ``max`` or the v1 "unlimited"
        sentinel value) or the cgroup files cannot be read or parsed.

    Example return value:
        {
            'total': 2147483648,      # bytes
            'used': 1073741824,       # bytes
            'available': 1073741824,  # bytes
            'free': 1073741824,       # bytes
            'percent': 50.0,          # percentage
        }
    """
    try:
        if os.path.exists(CGROUP_V2_MEMORY_MAX_FILE):
            memory_max = _read_cgroup_file(CGROUP_V2_MEMORY_MAX_FILE)
            if memory_max == "max":
                return None
            used = int(_read_cgroup_file(CGROUP_V2_MEMORY_CURRENT_FILE))
            return _build_memory_stats(int(memory_max), used)

        if os.path.exists(CGROUP_V1_MEMORY_LIMIT_FILE):
            total = int(_read_cgroup_file(CGROUP_V1_MEMORY_LIMIT_FILE))
            if total >= _CGROUP_V1_UNLIMITED_MEMORY_THRESHOLD:
                # v1 spells "unlimited" as a huge number, not a keyword.
                return None
            used = int(_read_cgroup_file(CGROUP_V1_MEMORY_USAGE_FILE))
            return _build_memory_stats(total, used)
    except (OSError, ValueError) as e:
        LOGGER.debug(f"Error reading container memory stats: {e}")

    return None


def _get_cgroup_allocated_cores() -> Optional[float]:
    """
    Get the number of CPU cores allocated to this cgroup (quota / period).

    Returns:
        The allocated cores as a float (e.g. 0.5 for half a core), or None
        if no quota is set or the cgroup files cannot be read or parsed.
    """
    try:
        if os.path.exists(CGROUP_V2_CPU_MAX_FILE):
            parts = _read_cgroup_file(CGROUP_V2_CPU_MAX_FILE).split()
            if len(parts) == 2 and parts[0] != "max":
                quota, period = int(parts[0]), int(parts[1])
                # Guard the division: a zero period would be malformed.
                if quota > 0 and period > 0:
                    return quota / period
        elif os.path.exists(CGROUP_V1_CPU_CFS_QUOTA_US_FILE):
            quota = int(_read_cgroup_file(CGROUP_V1_CPU_CFS_QUOTA_US_FILE))
            period = int(_read_cgroup_file(CGROUP_V1_CPU_CFS_PERIOD_US_FILE))
            if quota > 0 and period > 0:
                return quota / period
    except (OSError, ValueError) as e:
        LOGGER.debug(f"Error reading container CPU quota: {e}")
    return None


def _read_cgroup_cpu_usage_usec() -> Optional[int]:
    """
    Read the cgroup's cumulative CPU time in microseconds.

    Returns None when no usage counter is available. May raise OSError,
    ValueError or IndexError if a file is unreadable or malformed.
    """
    if os.path.exists(CGROUP_V2_CPU_STAT_FILE):
        with open(CGROUP_V2_CPU_STAT_FILE, encoding="utf-8") as f:
            for line in f:
                if line.startswith("usage_usec"):
                    return int(line.split()[1])
        return None

    if os.path.exists(CGROUP_V1_CPU_USAGE_FILE):
        # cpuacct.usage is in nanoseconds: 1 us == 1_000 ns.
        return int(_read_cgroup_file(CGROUP_V1_CPU_USAGE_FILE)) // 1_000

    return None


def get_cgroup_cpu_percent() -> Optional[float]:
    """
    Get CPU usage percentage for a cgroup-limited container.

    Works like psutil.cpu_percent(interval=None):
    - First call stores the current reading and returns 0.0
    - Subsequent calls return the CPU percent since the last call

    The percentage is relative to the cgroup's allocated cores, i.e. 100
    means the whole CPU quota was consumed during the interval.

    Returns:
        CPU usage as a percentage (0-100).
        0.0 if cgroup limits are configured but unable to read current usage
        (e.g., on the first call)
        None if cgroup limits are not configured or unable to read.
    """
    global _LAST_CGROUP_CPU_SAMPLE

    # Early return if no CPU limit is configured
    if not _has_cgroup_cpu_limit():
        return None

    try:
        current_usage_microseconds = _read_cgroup_cpu_usage_usec()
    except (OSError, ValueError, IndexError):
        # Error reading cgroup CPU stats - let the caller fall back to psutil
        return None

    if current_usage_microseconds is None:
        return 0.0

    allocated_cores = _get_cgroup_allocated_cores()
    if allocated_cores is None or allocated_cores <= 0:
        return 0.0

    # A monotonic clock keeps the interval correct even if the wall clock
    # is adjusted between two samples.
    current_time = time.monotonic()
    previous_sample = _LAST_CGROUP_CPU_SAMPLE
    _LAST_CGROUP_CPU_SAMPLE = (current_usage_microseconds, current_time)

    if previous_sample is None:
        # First call - store reading, return 0.0 (like psutil's first call)
        return 0.0

    last_usage, last_time = previous_sample
    delta_time = current_time - last_time
    if delta_time <= 0:
        return 0.0

    delta_usage_microseconds = current_usage_microseconds - last_usage
    delta_time_microseconds = delta_time * 1_000_000
    # CPU-time consumed divided by CPU-time available to the cgroup
    # (elapsed time x allocated cores), both in microseconds.
    percent = (
        delta_usage_microseconds / (delta_time_microseconds * allocated_cores)
    ) * 100

    return min(100.0, max(0.0, percent))
def test_get_python_version():
    assert isinstance(get_python_version(), str)


def test_has_cgroup_cpu_limits():
    """_has_cgroup_cpu_limit returns a plain bool and doesn't crash."""
    has_cgroup_cpu_limit = _has_cgroup_cpu_limit()
    assert isinstance(has_cgroup_cpu_limit, bool)


def test_get_container_resources():
    """cgroup CPU/memory helpers return None or well-formed values.

    The outcome depends on the host: outside a limited container both
    helpers return None, so only the shape of the result is checked.
    """
    cpu_result = get_cgroup_cpu_percent()
    memory_result = get_cgroup_mem_stats()

    assert cpu_result is None or isinstance(cpu_result, float)
    if cpu_result is not None:
        # The helper clamps its result to a percentage.
        assert 0.0 <= cpu_result <= 100.0

    assert memory_result is None or isinstance(memory_result, dict)
    if isinstance(memory_result, dict):
        # If we happen to be in a container, verify structure
        for key in ("total", "used", "available", "free", "percent"):
            assert key in memory_result