# FormicOS environment configuration
# Copy to .env and set your API key. That's it.
#
# Cloud-first by default: 3 containers, no GPU needed.
# For local GPU: uncomment the "Local GPU" section below,
# then run: bash scripts/setup-local-gpu.sh
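#
# Quick start (illustrative; assumes Docker Compose v2 at the repo root):
#   cp .env.example .env        # then set an API key below
#   docker compose up -d        # brings up the default 3-container cloud stack
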
# --- Cloud API keys (set at least one) ---
ANTHROPIC_API_KEY=sk-ant-...
# GEMINI_API_KEY=AI...
# OPENAI_API_KEY=sk-...
# DEEPSEEK_API_KEY=sk-...

# --- Project binding (Wave 81) ---
# Set to your project directory. Colonies will read/write against this root.
# PROJECT_DIR=/path/to/your/project
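# Example (illustrative value; substitute your own path):
#   PROJECT_DIR=$HOME/projects/my-app
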
# --- Local GPU override (uncomment entire block) ---
# COMPOSE_PROFILES=local-gpu
# QUEEN_MODEL=llama-cpp/qwen3.5-35b
# CODER_MODEL=llama-cpp/qwen3.5-35b
# REVIEWER_MODEL=llama-cpp/qwen3.5-35b
# RESEARCHER_MODEL=llama-cpp/qwen3.5-35b
# ARCHIVIST_MODEL=llama-cpp/qwen3.5-35b
# LLM_HOST=http://llm:8080
# EMBED_URL=http://formicos-embed:8200
# EMBED_MODEL=nomic-ai/nomic-embed-text-v1.5
# EMBED_DIMENSIONS=768
#
# Local LLM tuning:
# LLM_IMAGE=local/llama.cpp:server-cuda-blackwell
# LLM_MODEL_FILE=Qwen3.5-35B-A3B-Q4_K_M.gguf
# LLM_MODEL_ALIAS=qwen3.5-35b
# LLM_CHAT_TEMPLATE_ARGS=--chat-template-file /config/qwen35-chat.jinja
# LLM_FLASH_ATTN=on
# LLM_CACHE_TYPE_K=q4_0
# LLM_CACHE_TYPE_V=q4_0
# LLM_BATCH_SIZE=8192
# LLM_UBATCH_SIZE=4096
# LLM_CONTEXT_SIZE=65536
# LLM_SLOTS=2
# LLM_SLOT_PROMPT_SIMILARITY=0.5
# LLM_CACHE_RAM=1024
# EMBED_GPU_LAYERS=99
# CUDA_DEVICE=0
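#
# Bringing the block above into effect (a sketch; docker compose reads
# COMPOSE_PROFILES from .env automatically):
#   bash scripts/setup-local-gpu.sh
#   docker compose up -d
# To check the local server, llama.cpp's /health endpoint can be queried from
# inside the compose network (assumes curl is present in the image and that
# the service is named llm, as LLM_HOST suggests):
#   docker compose exec llm curl -s http://localhost:8080/health
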
# --- Devstral local eval (24B dense; safer defaults on 32 GB VRAM) ---
# COMPOSE_PROFILES=local-gpu
# QUEEN_MODEL=llama-cpp/devstral-small-2-24b
# CODER_MODEL=llama-cpp/devstral-small-2-24b
# REVIEWER_MODEL=llama-cpp/devstral-small-2-24b
# RESEARCHER_MODEL=llama-cpp/devstral-small-2-24b
# ARCHIVIST_MODEL=llama-cpp/devstral-small-2-24b
# FORMICOS_ENV_FILE=.env.devstral
# LLM_HOST=http://llm:8080
# EMBED_URL=http://formicos-embed:8200
# LLM_MODEL_FILE=mistralai_Devstral-Small-2-24B-Instruct-2512-Q4_K_M.gguf
# LLM_MODEL_ALIAS=devstral-small-2-24b
# LLM_CHAT_TEMPLATE_ARGS=
# LLM_FLASH_ATTN=off
# LLM_CACHE_TYPE_K=f16
# LLM_CACHE_TYPE_V=f16
# LLM_BATCH_SIZE=4096
# LLM_UBATCH_SIZE=2048
# LLM_CACHE_RAM=0
# LLM_CONTEXT_SIZE=32768
# LLM_SLOTS=3
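#
# One way to run this variant without editing .env (illustrative; assumes the
# block above is saved verbatim as .env.devstral, per FORMICOS_ENV_FILE):
#   docker compose --env-file .env.devstral up -d
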
# --- Hybrid routing (GPU + API keys — RECOMMENDED for local GPU users) ---
# Queen on cloud (large context window), colonies on local GPU (fast parallel workers).
# COMPOSE_PROFILES=local-gpu
# QUEEN_MODEL=anthropic/claude-sonnet-4-6
# CODER_MODEL=llama-cpp/qwen3.5-35b
# REVIEWER_MODEL=llama-cpp/qwen3.5-35b
# RESEARCHER_MODEL=anthropic/claude-haiku-4-5
# ARCHIVIST_MODEL=llama-cpp/qwen3.5-35b
# LLM_SLOTS=3

# --- Multi-GPU pinning ---
# Each GPU-using service has its own device variable.
# Default: everything on GPU 0. Multi-GPU splits the load.
#
# Single GPU (default):
# CUDA_DEVICE=0
#
# Multi-GPU (recommended for 2+ GPUs):
# GPU 0 (primary, e.g. RTX 5090): Queen model only — full VRAM for large context
# GPU 1 (secondary, e.g. RTX 3080): Swarm workers + embedding — uses multi-arch image
# CUDA_DEVICE=0
# CUDA_DEVICE_SWARM=1
# CUDA_DEVICE_EMBED=1
# EMBED_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
#
# The swarm image defaults to the official multi-arch build (ghcr.io/ggml-org/llama.cpp:server-cuda)
# which runs on any CUDA GPU. Override LLM_SWARM_IMAGE for a native build on specific hardware.
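#
# To verify the split, watch per-GPU memory on the host while the stack is up
# (nvidia-smi ships with the NVIDIA driver):
#   nvidia-smi -L           # list GPUs and their indices
#   watch -n 2 nvidia-smi   # Queen should fill GPU 0; swarm + embed, GPU 1
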
# --- Local Swarm (parallel colony workers on a second llama.cpp instance) ---
# Setup: bash scripts/setup-local-swarm.sh
# Start: docker compose -f docker-compose.yml -f docker-compose.local-swarm.yml up -d
#
# Deep Queen (RECOMMENDED for multi-GPU):
# Queen gets full 65K context on GPU 0, 4 parallel workers on GPU 1.
# GPU 0 VRAM: ~23GB (35B weights + bf16 KV). GPU 1 VRAM: ~8.7GB (4B + embed).
# LLM_SWARM_HOST=http://llm-swarm:8080
# LLM_SLOTS=1
# LLM_CONTEXT_SIZE=65536
# LLM_SWARM_CTX=128000
# LLM_SWARM_SLOTS=4
# CODER_MODEL=llama-cpp-swarm/qwen3.5-4b-swarm
# REVIEWER_MODEL=llama-cpp-swarm/qwen3.5-4b-swarm
# ARCHIVIST_MODEL=llama-cpp-swarm/qwen3.5-4b-swarm
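#
# A quick status check after the Start command above (both compose files must
# be passed again so the swarm service is in scope):
#   docker compose -f docker-compose.yml -f docker-compose.local-swarm.yml ps
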
# --- Sandbox execution ---
# Set to false to disable Docker sandbox container spawning (code_execute tool).
# SANDBOX_ENABLED=true

# --- Data directory ---
# Default: ./data in development, /data in Docker.
# IMPORTANT: Use named Docker volumes for SQLite persistence.
# FORMICOS_DATA_DIR=./data
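#
# A minimal named-volume sketch (illustrative; the service and volume names
# below are assumptions, not taken from this repo's compose files):
#   services:
#     queen:
#       volumes:
#         - formicos-data:/data
#   volumes:
#     formicos-data:
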
# --- Benchmark directory (dev only) ---
# Mount a benchmark exercises directory into the container.
# BENCHMARK_DIR=/path/to/polyglot-benchmark