From a532f3995e086218b1d06ab4d0d1f7f393d6feb1 Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Wed, 24 Sep 2025 11:43:58 +0800
Subject: [PATCH 1/3] archgw: address drift in prometheus cluster name

Signed-off-by: Adrian Cole
---
 inference-platforms/archgw/README.md                  | 3 +++
 inference-platforms/archgw/arch_config.yaml           | 3 ++-
 inference-platforms/archgw/docker-compose-elastic.yml | 3 +--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/inference-platforms/archgw/README.md b/inference-platforms/archgw/README.md
index fd327b2..1547af0 100644
--- a/inference-platforms/archgw/README.md
+++ b/inference-platforms/archgw/README.md
@@ -64,6 +64,9 @@ and anything added in Arch Gateway's [wasm filter][archgw-wasm].
   instructions to run from Docker (to avoid nested docker).
 * Traces come from Envoy, whose configuration is written by `archgw`. At the
   moment, this hard-codes aspects including default ports.
+* Prometheus metrics show the cluster as "openai_host" - the provider_interface
+  plus the first segment of the hostname (dots truncate the rest). The "host"
+  comes from "host.docker.internal".
 * Until [this][openai-responses] resolves, don't use `--use-responses-api`.
 
 The chat prompt was designed to be idempotent, but the results are not. You may
diff --git a/inference-platforms/archgw/arch_config.yaml b/inference-platforms/archgw/arch_config.yaml
index cf6dabd..7ad9355 100644
--- a/inference-platforms/archgw/arch_config.yaml
+++ b/inference-platforms/archgw/arch_config.yaml
@@ -8,7 +8,8 @@ listeners:
     timeout: 30s
 
 llm_providers:
-  - model: ollama/qwen3:0.6b
+  # We don't use 'ollama', as we want to use 'openai' provider_interface.
+  - model: local/qwen3:0.6b
     provider_interface: openai
     # This configuration is converted to Envoy and run inside Docker.
     base_url: http://host.docker.internal:11434
diff --git a/inference-platforms/archgw/docker-compose-elastic.yml b/inference-platforms/archgw/docker-compose-elastic.yml
index 8f04c2c..2037ddc 100644
--- a/inference-platforms/archgw/docker-compose-elastic.yml
+++ b/inference-platforms/archgw/docker-compose-elastic.yml
@@ -2,8 +2,7 @@ configs:
   # Configuration is simplified from archgw here:
   # https://github.com/katanemo/archgw/blob/main/docs/source/guides/observability/monitoring.rst
   #
-  # Note: The prometheus cluster name for qwen3:0.65b will shows up as '6b'
-  # See https://github.com/katanemo/archgw/issues/504
+  # Note: The cluster name for openai + host.docker.internal = openai_host
   prometheus-pump-config:
     content: |
       receivers:

From 56f2d97184e576448818a2554e2026460675bd5e Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Wed, 24 Sep 2025 11:59:19 +0800
Subject: [PATCH 2/3] feedback

Signed-off-by: Adrian Cole
---
 inference-platforms/archgw/arch_config.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inference-platforms/archgw/arch_config.yaml b/inference-platforms/archgw/arch_config.yaml
index 7ad9355..fa9e6e9 100644
--- a/inference-platforms/archgw/arch_config.yaml
+++ b/inference-platforms/archgw/arch_config.yaml
@@ -8,7 +8,8 @@ listeners:
     timeout: 30s
 
 llm_providers:
-  # We don't use 'ollama', as we want to use 'openai' provider_interface.
+  # ollama/ in the model name forces using the ollama interface even though we
+  # want provider_interface: openai. local/ avoids this.
   - model: local/qwen3:0.6b
     provider_interface: openai
     # This configuration is converted to Envoy and run inside Docker.
From 9230f4a89a3a963903e3c748fb97518b5f6d6c6c Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Wed, 24 Sep 2025 13:07:45 +0800
Subject: [PATCH 3/3] feedback

Signed-off-by: Adrian Cole
---
 inference-platforms/archgw/README.md                  | 4 +++-
 inference-platforms/archgw/arch_config.yaml           | 7 +++----
 inference-platforms/archgw/docker-compose-elastic.yml | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/inference-platforms/archgw/README.md b/inference-platforms/archgw/README.md
index 1547af0..08c6612 100644
--- a/inference-platforms/archgw/README.md
+++ b/inference-platforms/archgw/README.md
@@ -64,10 +64,11 @@ and anything added in Arch Gateway's [wasm filter][archgw-wasm].
   instructions to run from Docker (to avoid nested docker).
 * Traces come from Envoy, whose configuration is written by `archgw`. At the
   moment, this hard-codes aspects including default ports.
-* Prometheus metrics show the cluster as "openai_host" - the provider_interface
+* Prometheus metrics show the cluster as "ollama_host" - the provider_interface
   plus the first segment of the hostname (dots truncate the rest). The "host"
   comes from "host.docker.internal".
 * Until [this][openai-responses] resolves, don't use `--use-responses-api`.
+* Until [this][docker-env] resolves, make sure your PATH has /usr/local/bin.
 
 The chat prompt was designed to be idempotent, but the results are not. You may
 see something besides 'South Atlantic Ocean.'.
@@ -81,3 +82,4 @@ Just run it again until we find a way to make the results idempotent.
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
 [openai-responses]: https://github.com/katanemo/archgw/issues/476
 [otel-tui]: https://github.com/ymtdzzz/otel-tui
+[docker-env]: https://github.com/katanemo/archgw/issues/573
diff --git a/inference-platforms/archgw/arch_config.yaml b/inference-platforms/archgw/arch_config.yaml
index fa9e6e9..da6238a 100644
--- a/inference-platforms/archgw/arch_config.yaml
+++ b/inference-platforms/archgw/arch_config.yaml
@@ -8,10 +8,9 @@ listeners:
     timeout: 30s
 
 llm_providers:
-  # ollama/ in the model name forces using the ollama interface even though we
-  # want provider_interface: openai. local/ avoids this.
-  - model: local/qwen3:0.6b
-    provider_interface: openai
+  # Use ollama directly, since we can't inherit OPENAI_BASE_URL etc and need
+  # to hard-code the model anyway.
+  - model: ollama/qwen3:0.6b
     # This configuration is converted to Envoy and run inside Docker.
     base_url: http://host.docker.internal:11434
     default: true
diff --git a/inference-platforms/archgw/docker-compose-elastic.yml b/inference-platforms/archgw/docker-compose-elastic.yml
index 2037ddc..892faf8 100644
--- a/inference-platforms/archgw/docker-compose-elastic.yml
+++ b/inference-platforms/archgw/docker-compose-elastic.yml
@@ -2,7 +2,7 @@ configs:
   # Configuration is simplified from archgw here:
   # https://github.com/katanemo/archgw/blob/main/docs/source/guides/observability/monitoring.rst
   #
-  # Note: The cluster name for openai + host.docker.internal = openai_host
+  # Note: The cluster name for ollama + host.docker.internal = ollama_host
   prometheus-pump-config:
     content: |
       receivers:
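
To make the naming rule these patches document concrete, here is a minimal sketch of how a cluster name like "ollama_host" could be derived. The function `cluster_name` and its inputs are illustrative assumptions, not archgw's actual implementation:

```python
from urllib.parse import urlparse

def cluster_name(provider: str, base_url: str) -> str:
    # Hypothetical reconstruction of the rule described in the README:
    # the provider prefix joined to the first segment of the hostname;
    # dots truncate the rest (e.g. 'host.docker.internal' -> 'host').
    host = urlparse(base_url).hostname
    return f"{provider}_{host.split('.')[0]}"

# ollama + http://host.docker.internal:11434 -> 'ollama_host'
print(cluster_name("ollama", "http://host.docker.internal:11434"))
```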