From a532f3995e086218b1d06ab4d0d1f7f393d6feb1 Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Wed, 24 Sep 2025 11:43:58 +0800
Subject: [PATCH 1/3] archgw: address drift in prometheus cluster name

Signed-off-by: Adrian Cole
---
 inference-platforms/archgw/README.md                  | 3 +++
 inference-platforms/archgw/arch_config.yaml           | 3 ++-
 inference-platforms/archgw/docker-compose-elastic.yml | 3 +--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/inference-platforms/archgw/README.md b/inference-platforms/archgw/README.md
index fd327b2..1547af0 100644
--- a/inference-platforms/archgw/README.md
+++ b/inference-platforms/archgw/README.md
@@ -64,6 +64,9 @@ and anything added in Arch Gateway's [wasm filter][archgw-wasm].
   instructions to run from Docker (to avoid nested docker).
 * Traces come from Envoy, whose configuration is written by `archgw`. At the
   moment, this hard-codes aspects including default ports.
+* Prometheus metrics show the cluster as "openai_host" - the provider_interface
+  plus the first segment of the hostname (dots truncate the rest). The "host"
+  comes from "host.docker.internal".
 * Until [this][openai-responses] resolves, don't use `--use-responses-api`.
 
 The chat prompt was designed to be idempotent, but the results are not. You may
diff --git a/inference-platforms/archgw/arch_config.yaml b/inference-platforms/archgw/arch_config.yaml
index cf6dabd..7ad9355 100644
--- a/inference-platforms/archgw/arch_config.yaml
+++ b/inference-platforms/archgw/arch_config.yaml
@@ -8,7 +8,8 @@ listeners:
     timeout: 30s
 
 llm_providers:
-  - model: ollama/qwen3:0.6b
+  # We don't use 'ollama', as we want to use 'openai' provider_interface.
+  - model: local/qwen3:0.6b
     provider_interface: openai
     # This configuration is converted to Envoy and run inside Docker.
     base_url: http://host.docker.internal:11434
diff --git a/inference-platforms/archgw/docker-compose-elastic.yml b/inference-platforms/archgw/docker-compose-elastic.yml
index 8f04c2c..2037ddc 100644
--- a/inference-platforms/archgw/docker-compose-elastic.yml
+++ b/inference-platforms/archgw/docker-compose-elastic.yml
@@ -2,8 +2,7 @@ configs:
   # Configuration is simplified from archgw here:
   # https://github.com/katanemo/archgw/blob/main/docs/source/guides/observability/monitoring.rst
   #
-  # Note: The prometheus cluster name for qwen3:0.65b will shows up as '6b'
-  # See https://github.com/katanemo/archgw/issues/504
+  # Note: The cluster name for openai + host.docker.internal = openai_host
   prometheus-pump-config:
     content: |
       receivers:

From 56f2d97184e576448818a2554e2026460675bd5e Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Wed, 24 Sep 2025 11:59:19 +0800
Subject: [PATCH 2/3] feedback

Signed-off-by: Adrian Cole
---
 inference-platforms/archgw/arch_config.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inference-platforms/archgw/arch_config.yaml b/inference-platforms/archgw/arch_config.yaml
index 7ad9355..fa9e6e9 100644
--- a/inference-platforms/archgw/arch_config.yaml
+++ b/inference-platforms/archgw/arch_config.yaml
@@ -8,7 +8,8 @@ listeners:
     timeout: 30s
 
 llm_providers:
-  # We don't use 'ollama', as we want to use 'openai' provider_interface.
+  # ollama/ in the model name forces using the ollama interface even though we
+  # want provider_interface: openai. local/ avoids this.
   - model: local/qwen3:0.6b
     provider_interface: openai
     # This configuration is converted to Envoy and run inside Docker.
From 9230f4a89a3a963903e3c748fb97518b5f6d6c6c Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Wed, 24 Sep 2025 13:07:45 +0800
Subject: [PATCH 3/3] feedback

Signed-off-by: Adrian Cole
---
 inference-platforms/archgw/README.md                  | 4 +++-
 inference-platforms/archgw/arch_config.yaml           | 7 +++----
 inference-platforms/archgw/docker-compose-elastic.yml | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/inference-platforms/archgw/README.md b/inference-platforms/archgw/README.md
index 1547af0..08c6612 100644
--- a/inference-platforms/archgw/README.md
+++ b/inference-platforms/archgw/README.md
@@ -64,10 +64,11 @@ and anything added in Arch Gateway's [wasm filter][archgw-wasm].
   instructions to run from Docker (to avoid nested docker).
 * Traces come from Envoy, whose configuration is written by `archgw`. At the
   moment, this hard-codes aspects including default ports.
-* Prometheus metrics show the cluster as "openai_host" - the provider_interface
+* Prometheus metrics show the cluster as "ollama_host" - the provider_interface
   plus the first segment of the hostname (dots truncate the rest). The "host"
   comes from "host.docker.internal".
 * Until [this][openai-responses] resolves, don't use `--use-responses-api`.
+* Until [this][docker-env] resolves, make sure your PATH has /usr/local/bin.
 
 The chat prompt was designed to be idempotent, but the results are not. You may
 see something besides 'South Atlantic Ocean.'.
@@ -81,3 +82,4 @@ Just run it again until we find a way to make the results idempotent.
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
 [openai-responses]: https://github.com/katanemo/archgw/issues/476
 [otel-tui]: https://github.com/ymtdzzz/otel-tui
+[docker-env]: https://github.com/katanemo/archgw/issues/573
diff --git a/inference-platforms/archgw/arch_config.yaml b/inference-platforms/archgw/arch_config.yaml
index fa9e6e9..da6238a 100644
--- a/inference-platforms/archgw/arch_config.yaml
+++ b/inference-platforms/archgw/arch_config.yaml
@@ -8,10 +8,9 @@ listeners:
     timeout: 30s
 
 llm_providers:
-  # ollama/ in the model name forces using the ollama interface even though we
-  # want provider_interface: openai. local/ avoids this.
-  - model: local/qwen3:0.6b
-    provider_interface: openai
+  # Use ollama directly, since we can't inherit OPENAI_BASE_URL etc and need
+  # to hard-code the model anyway.
+  - model: ollama/qwen3:0.6b
     # This configuration is converted to Envoy and run inside Docker.
     base_url: http://host.docker.internal:11434
     default: true
diff --git a/inference-platforms/archgw/docker-compose-elastic.yml b/inference-platforms/archgw/docker-compose-elastic.yml
index 2037ddc..892faf8 100644
--- a/inference-platforms/archgw/docker-compose-elastic.yml
+++ b/inference-platforms/archgw/docker-compose-elastic.yml
@@ -2,7 +2,7 @@ configs:
   # Configuration is simplified from archgw here:
   # https://github.com/katanemo/archgw/blob/main/docs/source/guides/observability/monitoring.rst
   #
-  # Note: The cluster name for openai + host.docker.internal = openai_host
+  # Note: The cluster name for ollama + host.docker.internal = ollama_host
   prometheus-pump-config:
     content: |
       receivers:
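
To make the naming rule these patches document concrete, here is a minimal sketch of how a cluster name like "ollama_host" could be derived. The function `cluster_name` and its inputs are illustrative assumptions, not archgw's actual implementation:

```python
from urllib.parse import urlparse

def cluster_name(provider: str, base_url: str) -> str:
    # Hypothetical reconstruction of the rule described in the README:
    # the provider prefix joined to the first segment of the hostname;
    # dots truncate the rest (e.g. 'host.docker.internal' -> 'host').
    host = urlparse(base_url).hostname
    return f"{provider}_{host.split('.')[0]}"

# ollama + http://host.docker.internal:11434 -> 'ollama_host'
print(cluster_name("ollama", "http://host.docker.internal:11434"))
```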