fix: correct max-model-len typo 8096 -> 8192 in docs and Dockerfile

alchemystack · alchemystack · commit e5d169ef84d3 · 2026-03-29T16:18:59.000+02:00
diff --git a/README.md b/README.md
@@ -139,15 +139,15 @@ curl http://localhost:8000/v1/completions \
 pip install qr-sampler
 
 # Start vLLM — qr-sampler registers automatically via entry points
-vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8096 --gpu-memory-utilization 0.80
+vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8192 --gpu-memory-utilization 0.80
 ```
 
 Configure the entropy source via environment variables:
 
 ```bash
 export QR_ENTROPY_SOURCE_TYPE=quantum_grpc
 export QR_GRPC_SERVER_ADDRESS=localhost:50051
-vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8096 --gpu-memory-utilization 0.80
+vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8192 --gpu-memory-utilization 0.80
 ```
 
 ### Apple Silicon (macOS)
@@ -579,7 +579,7 @@ Or configure directly via environment variables (bare-metal):
 ```bash
 export QR_ENTROPY_SOURCE_TYPE=quantum_grpc
 export QR_GRPC_SERVER_ADDRESS=localhost:50051
-vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8096 --gpu-memory-utilization 0.80
+vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype half --max-model-len 8192 --gpu-memory-utilization 0.80
 ```
 
 The template handles all gRPC boilerplate (unary + bidirectional streaming, health checks, graceful shutdown). You only write the hardware-specific code.
diff --git a/examples/docker/Dockerfile.vllm b/examples/docker/Dockerfile.vllm
@@ -50,4 +50,4 @@ ENTRYPOINT []
 
 # Start vLLM. The qr-sampler plugin is auto-discovered via entry points.
 # Shell form so environment variables are resolved at runtime.
-CMD vllm serve ${HF_MODEL} --host 0.0.0.0 --port 8000 --dtype half --max-model-len 8096 --gpu-memory-utilization 0.80
+CMD vllm serve ${HF_MODEL} --host 0.0.0.0 --port 8000 --dtype half --max-model-len 8192 --gpu-memory-utilization 0.80