Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
bbd42ed
feat: docker-aware machine stats
Ddfulton Dec 12, 2025
9a2baf8
feat: Show container memory in machine stats when running with cgroup…
Ddfulton Dec 16, 2025
7636ba8
Merge branch 'main' into ccloud/container-cgroups
Ddfulton Dec 16, 2025
66a9501
Merge branch 'main' into ccloud/container-cgroups
Ddfulton Dec 18, 2025
808b44d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 18, 2025
91f4a91
Merge branch 'marimo-team:main' into ccloud/container-cgroups
Ddfulton Dec 26, 2025
50f15a4
refactor: split mem and cpu cgroups functions + TypedDict for memory …
Ddfulton Dec 26, 2025
975bccc
Merge branch 'main' into ccloud/container-cgroups
Ddfulton Dec 26, 2025
3e80595
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 26, 2025
16ea07b
regenerate: update OpenAPI schema with cgroup memory limit detection
Ddfulton Dec 26, 2025
8fa0f39
revert: remove accidental linting changes to json-parser.ts
Ddfulton Dec 26, 2025
25f4924
fix: revert api.ts to two-space indentation
Ddfulton Dec 26, 2025
ae23665
refactor: clean up cgroup health utility code
Ddfulton Dec 26, 2025
92c7f65
lint: import nit on marimo/_utils/health.py
Ddfulton Dec 26, 2025
766fae4
Merge branch 'marimo-team:main' into ccloud/container-cgroups
Ddfulton Dec 30, 2025
e52fd85
fix: removed spurious comments + added cgroup documentation + specifi…
Ddfulton Dec 31, 2025
ae415be
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 31, 2025
08d049a
fix: typeo with _LAST_CGROUP_CPU_SAMPLE constant
Ddfulton Dec 31, 2025
00d5a2f
fix: updated imports to health utils smoke test
Ddfulton Dec 31, 2025
6dc1701
fix: changed smoke test isinstance to dict, not TypedDict, which is u…
Ddfulton Dec 31, 2025
3cdea1c
fix: rename is_container to has_cgroup_mem_limit in test_health.py
Ddfulton Jan 2, 2026
7f690f9
Merge branch 'marimo-team:main' into ccloud/container-cgroups
Ddfulton Jan 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,11 @@ const MemoryUsageBar: React.FC<{
kernel: UsageResponse["kernel"];
server: UsageResponse["server"];
}> = ({ memory, kernel, server }) => {
const { percent, total, available } = memory;
const { percent, total, available, has_cgroup_mem_limit } = memory;
const roundedPercent = Math.round(percent);
const memoryLabel = has_cgroup_mem_limit
? "container memory"
: "computer memory";

const gbFormatter = useNumberFormatter({
maximumFractionDigits: 2,
Expand All @@ -82,7 +85,7 @@ const MemoryUsageBar: React.FC<{
content={
<div className="flex flex-col gap-1">
<span>
<b>computer memory:</b> {formatGB(total - available)} /{" "}
<b>{memoryLabel}:</b> {formatGB(total - available)} /{" "}
{formatGB(total)} GB ({roundedPercent}%)
</span>
{server?.memory && (
Expand Down
41 changes: 29 additions & 12 deletions marimo/_server/api/endpoints/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from marimo._server.api.deps import AppState
from marimo._server.router import APIRouter
from marimo._utils.health import (
get_cgroup_cpu_percent,
get_cgroup_mem_stats,
get_node_version,
get_python_version,
get_required_modules_list,
Expand Down Expand Up @@ -130,12 +132,15 @@ async def usage(request: Request) -> JSONResponse:
type: integer
free:
type: integer
has_cgroup_mem_limit:
type: boolean
required:
- total
- available
- percent
- used
- free
- has_cgroup_mem_limit
server:
type: object
properties:
Expand Down Expand Up @@ -193,10 +198,28 @@ async def usage(request: Request) -> JSONResponse:

import psutil

memory = psutil.virtual_memory()
# interval=None is nonblocking; first value is meaningless but after
# that it's useful.
cpu = psutil.cpu_percent(interval=None)
if cgroup_mem_stats := get_cgroup_mem_stats():
memory_stats = {
"has_cgroup_mem_limit": True,
**cgroup_mem_stats,
}
else:
# Use host memory stats
memory = psutil.virtual_memory()
memory_stats = {
"total": memory.total,
"available": memory.available,
"percent": memory.percent,
"used": memory.used,
"free": memory.free,
"has_cgroup_mem_limit": False,
}

cpu = get_cgroup_cpu_percent()
if cpu is None:
# interval=None is nonblocking; first call returns meaningless value
# subsequent calls return delta since last call
cpu = psutil.cpu_percent(interval=None)

# Server memory (and children)
main_process = psutil.Process()
Expand Down Expand Up @@ -267,14 +290,8 @@ async def usage(request: Request) -> JSONResponse:

return JSONResponse(
{
# computer memory
"memory": {
"total": memory.total,
"available": memory.available,
"percent": memory.percent,
"used": memory.used,
"free": memory.free,
},
# computer memory or container memory
"memory": memory_stats,
# marimo server
"server": {
"memory": server_memory,
Expand Down
202 changes: 201 additions & 1 deletion marimo/_utils/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,47 @@
from __future__ import annotations

import importlib.metadata
import os
import subprocess
import sys
from typing import Optional
import time
from typing import Optional, TypedDict

from marimo import _loggers

LOGGER = _loggers.marimo_logger()

TIMEOUT = 10 # seconds

# Module-level state for cgroup CPU percent calculation (like psutil does).
# get_cgroup_cpu_percent() stores the most recent sample here and diffs the
# next reading against it; None means no sample has been taken yet.
_LAST_CGROUP_CPU_SAMPLE: Optional[tuple[int, float]] = (
    None  # (usage_usec, timestamp)
)


class MemoryStats(TypedDict):
    """Memory figures reported for a cgroup-limited container.

    Sizes are byte counts; ``percent`` is on a 0-100 scale.
    """

    # Configured limit and current consumption.
    total: int
    used: int

    # Remaining headroom; the cgroup readers report the same value for both.
    free: int
    available: int

    # used / total, expressed as a percentage.
    percent: float


# Constants for cgroup v2 file locations
# Reference: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
# memory.max: memory limit in bytes, or the literal "max" when unlimited.
CGROUP_V2_MEMORY_MAX_FILE = "/sys/fs/cgroup/memory.max"
# memory.current: current memory usage in bytes.
CGROUP_V2_MEMORY_CURRENT_FILE = "/sys/fs/cgroup/memory.current"
# cpu.stat: key/value lines; the "usage_usec" line is the one read here.
CGROUP_V2_CPU_STAT_FILE = "/sys/fs/cgroup/cpu.stat"
# cpu.max: two fields, "<quota> <period>"; quota is "max" when unlimited.
CGROUP_V2_CPU_MAX_FILE = "/sys/fs/cgroup/cpu.max"

# cgroup v1 file locations (legacy hierarchy)
# Reference: https://www.kernel.org/doc/Documentation/cgroup-v1/
# cpuacct.usage: cumulative CPU time in nanoseconds.
CGROUP_V1_CPU_USAGE_FILE = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
# cpu.cfs_quota_us / cpu.cfs_period_us: CPU quota and period; a quota that
# is not positive is treated as "no limit" by the readers in this module.
CGROUP_V1_CPU_CFS_QUOTA_US_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
CGROUP_V1_CPU_CFS_PERIOD_US_FILE = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
# memory.limit_in_bytes / memory.usage_in_bytes: memory limit and usage.
CGROUP_V1_MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
CGROUP_V1_MEMORY_USAGE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes"


def get_node_version() -> Optional[str]:
try:
Expand Down Expand Up @@ -183,3 +214,172 @@ def communicate_with_timeout(
except subprocess.TimeoutExpired:
process.kill()
return "", "Error: Process timed out"


def _has_cgroup_cpu_limit() -> bool:
    """
    Return whether this process runs under a cgroup CPU quota.

    Checks cgroup v2 (``cpu.max``) first, then falls back to the legacy
    cgroup v1 hierarchy (``cpu.cfs_quota_us``).

    Returns:
        True if a finite CPU quota is configured. False if no quota is set,
        the cgroup files do not exist, or they cannot be read or parsed.
    """
    try:
        if os.path.exists(CGROUP_V2_CPU_MAX_FILE):
            with open(CGROUP_V2_CPU_MAX_FILE, encoding="utf-8") as f:
                fields = f.read().split()
            # cpu.max holds two fields, "<quota> <period>". An unlimited
            # cgroup reads "max 100000", so comparing the whole string to
            # "max" would always report a limit; only the first field counts.
            return bool(fields) and fields[0] != "max"
        # Fallback to cgroup v1 (legacy)
        if os.path.exists(CGROUP_V1_CPU_CFS_QUOTA_US_FILE):
            with open(CGROUP_V1_CPU_CFS_QUOTA_US_FILE, encoding="utf-8") as f:
                quota = int(f.read().strip())
            # v1 reports -1 when no quota is configured.
            return quota > 0
    except (OSError, ValueError):
        # Best-effort probe: an unreadable or malformed file means we cannot
        # rely on cgroup data, so report "no limit" instead of raising.
        return False
    return False


def get_cgroup_mem_stats() -> Optional[MemoryStats]:
    """
    Get container memory stats from cgroup.

    Checks cgroup v2 (``memory.max`` / ``memory.current``) first and falls
    back to the legacy cgroup v1 files when the v2 file does not exist.

    Returns:
        Dictionary with memory stats if a cgroup memory limit is configured,
        None if no limit is configured (v2 reports ``max``; v1 reports a
        huge sentinel value) or the cgroup files cannot be read or parsed.

    Example return value:
        {
            'total': 2147483648,     # bytes
            'used': 1073741824,      # bytes
            'available': 1073741824, # bytes
            'free': 1073741824,      # bytes
            'percent': 50.0,         # percentage
        }
    """
    # cgroup v1 has no "max" keyword: an unlimited cgroup reports a
    # page-rounded LONG_MAX (9223372036854771712 with 4 KiB pages on 64-bit).
    # Treat anything at or above 1 EiB as "no limit" so the caller falls back
    # to host memory instead of showing an exabyte-sized container.
    v1_unlimited_threshold = 1 << 60

    try:
        if os.path.exists(CGROUP_V2_MEMORY_MAX_FILE):
            with open(CGROUP_V2_MEMORY_MAX_FILE, encoding="utf-8") as f:
                memory_max = f.read().strip()
            if memory_max == "max":
                return None
            total = int(memory_max)
            with open(CGROUP_V2_MEMORY_CURRENT_FILE, encoding="utf-8") as f:
                used = int(f.read().strip())
        elif os.path.exists(CGROUP_V1_MEMORY_LIMIT_FILE):
            with open(CGROUP_V1_MEMORY_LIMIT_FILE, encoding="utf-8") as f:
                total = int(f.read().strip())
            if total >= v1_unlimited_threshold:
                return None
            with open(CGROUP_V1_MEMORY_USAGE_FILE, encoding="utf-8") as f:
                used = int(f.read().strip())
        else:
            return None
    except (OSError, ValueError) as e:
        # Best effort: missing, unreadable or malformed files mean we cannot
        # report container memory, so the caller should use host stats.
        LOGGER.debug(f"Error reading container memory stats: {e}")
        return None

    # Clamp so a momentary usage > limit reading never yields negative values.
    available = max(0, total - used)
    percent = (used / total) * 100 if total > 0 else 0.0
    return MemoryStats(
        total=total,
        used=used,
        available=available,
        percent=percent,
        free=available,  # free == available for cgroup memory
    )


def _get_cgroup_allocated_cores() -> Optional[float]:
    """
    Get the number of CPU cores allocated to this cgroup (quota / period).

    Reads cgroup v2 ``cpu.max`` when it exists, otherwise the cgroup v1
    ``cpu.cfs_quota_us`` / ``cpu.cfs_period_us`` pair.

    Returns:
        The allocation as a (possibly fractional) number of cores, or None
        when no finite quota is configured or the files cannot be read or
        parsed.
    """
    try:
        if os.path.exists(CGROUP_V2_CPU_MAX_FILE):
            with open(CGROUP_V2_CPU_MAX_FILE, encoding="utf-8") as f:
                parts = f.read().split()
            # Expected format is "<quota> <period>"; quota "max" = unlimited.
            if len(parts) != 2 or parts[0] == "max":
                return None
            quota, period = int(parts[0]), int(parts[1])
        elif os.path.exists(CGROUP_V1_CPU_CFS_QUOTA_US_FILE):
            with open(CGROUP_V1_CPU_CFS_QUOTA_US_FILE, encoding="utf-8") as f:
                quota = int(f.read().strip())
            with open(CGROUP_V1_CPU_CFS_PERIOD_US_FILE, encoding="utf-8") as f:
                period = int(f.read().strip())
        else:
            return None
    except (OSError, ValueError):
        # Best effort: unreadable or malformed files mean "unknown".
        return None

    # A non-positive quota means "no limit" (v1 uses -1); a non-positive
    # period would make the ratio meaningless (or divide by zero).
    if quota <= 0 or period <= 0:
        return None
    return quota / period


def get_cgroup_cpu_percent() -> Optional[float]:
    """
    Get CPU usage percentage for a cgroup-limited container.

    Works like psutil.cpu_percent(interval=None):
    - First call stores the current reading and returns 0.0
    - Subsequent calls return the CPU percent since the last call

    The percentage is relative to the CPU allocation of the cgroup
    (quota / period), so 100 means the whole quota was consumed.

    Returns:
        CPU usage as a percentage (0-100).
        0.0 if cgroup limits are configured but the current usage could not
        be determined, and on the first call.
        None if no usable cgroup CPU limit is configured or the cgroup files
        cannot be read; callers should then fall back to psutil.
    """
    global _LAST_CGROUP_CPU_SAMPLE

    try:
        # Early return if no CPU limit is configured. Kept inside the try so
        # an unreadable cgroup file degrades to the psutil fallback.
        if not _has_cgroup_cpu_limit():
            return None

        # Read current cumulative usage (microseconds)
        current_usage_microseconds: Optional[int] = None

        if os.path.exists(CGROUP_V2_CPU_STAT_FILE):
            with open(CGROUP_V2_CPU_STAT_FILE, encoding="utf-8") as f:
                for line in f:
                    if line.startswith("usage_usec"):
                        current_usage_microseconds = int(line.split()[1])
                        break

        elif os.path.exists(CGROUP_V1_CPU_USAGE_FILE):
            with open(CGROUP_V1_CPU_USAGE_FILE, encoding="utf-8") as f:
                # cpuacct.usage is in nanoseconds: 1 μs == 1_000 ns
                current_usage_microseconds = int(f.read().strip()) // 1_000

        if current_usage_microseconds is None:
            return 0.0

        allocated_cores = _get_cgroup_allocated_cores()
        if allocated_cores is None or allocated_cores <= 0:
            # Without a usable quota there is nothing to normalise against;
            # let the caller fall back to psutil instead of reporting 0%.
            return None

        current_time = time.time()

        if _LAST_CGROUP_CPU_SAMPLE is None:
            # First call - store reading, return 0.0 (like psutil's first call)
            _LAST_CGROUP_CPU_SAMPLE = (
                current_usage_microseconds,
                current_time,
            )
            return 0.0

        last_usage, last_time = _LAST_CGROUP_CPU_SAMPLE
        _LAST_CGROUP_CPU_SAMPLE = (current_usage_microseconds, current_time)

        delta_time = current_time - last_time
        if delta_time <= 0:
            # Clock did not advance (or went backwards); no meaningful rate.
            return 0.0

        delta_usage_microseconds = current_usage_microseconds - last_usage
        # Wall-clock seconds -> microseconds, converted exactly once so both
        # numerator and denominator are in the same unit.
        delta_time_microseconds = delta_time * 1_000_000
        percent = (
            delta_usage_microseconds
            / (delta_time_microseconds * allocated_cores)
        ) * 100

        return min(100.0, max(0.0, percent))

    except (OSError, ValueError, IndexError):
        # Error reading cgroup CPU stats — fall back to psutil
        return None
3 changes: 3 additions & 0 deletions packages/openapi/api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5367,6 +5367,8 @@ paths:
type: integer
free:
type: integer
has_cgroup_mem_limit:
type: boolean
percent:
type: number
total:
Expand All @@ -5379,6 +5381,7 @@ paths:
- percent
- used
- free
- has_cgroup_mem_limit
type: object
server:
properties:
Expand Down
1 change: 1 addition & 0 deletions packages/openapi/src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2663,6 +2663,7 @@ export interface paths {
memory: {
available: number;
free: number;
has_cgroup_mem_limit: boolean;
percent: number;
total: number;
used: number;
Expand Down
2 changes: 2 additions & 0 deletions tests/_server/api/endpoints/test_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def test_memory(client: TestClient) -> None:
assert memory["available"] > 0
assert memory["used"] > 0
assert memory["free"] > 0
assert "has_cgroup_mem_limit" in memory
assert isinstance(memory["has_cgroup_mem_limit"], bool)
cpu = response.json()["cpu"]
assert cpu["percent"] >= 0
computer = response.json()["server"]
Expand Down
23 changes: 23 additions & 0 deletions tests/_utils/test_health_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

from marimo._utils.health import (
_get_versions,
_has_cgroup_cpu_limit,
get_cgroup_cpu_percent,
get_cgroup_mem_stats,
get_chrome_version,
get_node_version,
get_optional_modules_list,
Expand Down Expand Up @@ -32,3 +35,23 @@ def test_get_chrome_version():

def test_get_python_version():
    """get_python_version should return a string."""
    version = get_python_version()
    assert isinstance(version, str)


def test_has_cgroup_cpu_limits():
    """Smoke test: _has_cgroup_cpu_limit returns a bool and doesn't crash"""
    assert isinstance(_has_cgroup_cpu_limit(), bool)


def test_get_container_resources():
    """Smoke test: the cgroup CPU/memory helpers return None or a well-formed value"""
    cpu = get_cgroup_cpu_percent()
    mem = get_cgroup_mem_stats()
    assert cpu is None or isinstance(cpu, float)
    if mem is None:
        # Not running under a cgroup memory limit; nothing more to verify.
        return
    assert isinstance(mem, dict)
    # If we happen to be in a container, verify structure
    for key in ("total", "used", "free", "percent"):
        assert key in mem
Loading