From 048826783d35a42c0f1a548a7f0b7f0ae50fa462 Mon Sep 17 00:00:00 2001 From: xeniape Date: Wed, 18 Jun 2025 15:29:30 +0200 Subject: [PATCH 1/4] chore: add test and docs for metrics --- .../trino/pages/usage-guide/monitoring.adoc | 35 ++++++++ tests/templates/kuttl/smoke/21-assert.yaml | 1 + .../kuttl/smoke/21-copy-scripts.yaml | 1 + tests/templates/kuttl/smoke/31-assert.yaml | 1 + tests/templates/kuttl/smoke/check-metrics.py | 83 +++++++++++++++++++ .../templates/kuttl/smoke_aws/21-assert.yaml | 1 + .../kuttl/smoke_aws/21-copy-scripts.yaml | 1 + .../kuttl/smoke_aws/check-metrics.py | 83 +++++++++++++++++++ 8 files changed, 206 insertions(+) create mode 100644 tests/templates/kuttl/smoke/check-metrics.py create mode 100644 tests/templates/kuttl/smoke_aws/check-metrics.py diff --git a/docs/modules/trino/pages/usage-guide/monitoring.adoc b/docs/modules/trino/pages/usage-guide/monitoring.adoc index ad96afe8..0740b22a 100644 --- a/docs/modules/trino/pages/usage-guide/monitoring.adoc +++ b/docs/modules/trino/pages/usage-guide/monitoring.adoc @@ -3,3 +3,38 @@ The managed Trino instances are automatically configured to export Prometheus metrics. See xref:operators:monitoring.adoc[] for more details. + +== Metrics + +Trino automatically exposes built-in Prometheus metrics on coordinators and workers. The metrics are available on the `http` (`8080/metrics`) or +`https` (`8443/metrics`) port, depending on the TLS settings. + +The following `ServiceMonitor` example demonstrates how the metrics can be scraped using the https://prometheus-operator.dev/[Prometheus Operator]. 
+ +[source,yaml] +---- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: scrape-label +spec: + endpoints: + - port: https # or http + scheme: https # or http + path: /metrics + basicAuth: # <1> + username: + name: trino-user-secret + key: username + password: + name: trino-user-secret + key: password + jobLabel: app.kubernetes.io/instance + namespaceSelector: + any: true + selector: + matchLabels: + prometheus.io/scrape: "true" +---- + +<1> Add user information if Trino is configured to use authentication diff --git a/tests/templates/kuttl/smoke/21-assert.yaml b/tests/templates/kuttl/smoke/21-assert.yaml index 600736ce..b3f78bfc 100644 --- a/tests/templates/kuttl/smoke/21-assert.yaml +++ b/tests/templates/kuttl/smoke/21-assert.yaml @@ -6,3 +6,4 @@ commands: - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p admin -n $NAMESPACE -w 1 - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-opa.py -n $NAMESPACE - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-s3.py -n $NAMESPACE + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-metrics.py -n $NAMESPACE diff --git a/tests/templates/kuttl/smoke/21-copy-scripts.yaml b/tests/templates/kuttl/smoke/21-copy-scripts.yaml index f38f3274..8c8ce409 100644 --- a/tests/templates/kuttl/smoke/21-copy-scripts.yaml +++ b/tests/templates/kuttl/smoke/21-copy-scripts.yaml @@ -5,3 +5,4 @@ commands: - script: kubectl cp -n $NAMESPACE ./check-active-workers.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-opa.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-s3.py trino-test-helper-0:/tmp || true + - script: kubectl cp -n $NAMESPACE ./check-metrics.py trino-test-helper-0:/tmp || true diff --git a/tests/templates/kuttl/smoke/31-assert.yaml b/tests/templates/kuttl/smoke/31-assert.yaml index 0690b385..fa6250c7 100644 
--- a/tests/templates/kuttl/smoke/31-assert.yaml +++ b/tests/templates/kuttl/smoke/31-assert.yaml @@ -6,3 +6,4 @@ commands: - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p admin -n $NAMESPACE -w 2 - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-opa.py -n $NAMESPACE - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-s3.py -n $NAMESPACE + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-metrics.py -n $NAMESPACE diff --git a/tests/templates/kuttl/smoke/check-metrics.py b/tests/templates/kuttl/smoke/check-metrics.py new file mode 100644 index 00000000..5e06797b --- /dev/null +++ b/tests/templates/kuttl/smoke/check-metrics.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +import argparse +import requests +import time + + +def print_request_error_and_sleep(message, err, retry_count): + print("[" + str(retry_count) + "] " + message, err) + time.sleep(5) + + +def try_get(url): + retries = 3 + for i in range(retries): + try: + r = requests.get(url, timeout=5, auth=("trino", "")) + r.raise_for_status() + return r + except requests.exceptions.HTTPError as errh: + print_request_error_and_sleep("Http Error: ", errh, i) + except requests.exceptions.ConnectionError as errc: + print_request_error_and_sleep("Error Connecting: ", errc, i) + except requests.exceptions.Timeout as errt: + print_request_error_and_sleep("Timeout Error: ", errt, i) + except requests.exceptions.RequestException as err: + print_request_error_and_sleep("Error: ", err, i) + + exit(-1) + + +def check_monitoring(hosts): + for host in hosts: + # test for the jmx exporter metrics + url = host + ":8081" + response = try_get(url) + + if response.ok: + continue + else: + print("Error for [" + url + "]: could not access monitoring") + exit(-1) + + # test for the native metrics + url = host + ":8443/metrics" + response = try_get(url) + + if response.ok: + # arbitrary metric was 
chosen to test if metrics are present in the response + if "io_airlift_node_name_NodeInfo_StartTime" in response.text: + continue + else: + print("Error for [" + url + "]: missing metrics") + exit(-1) + continue + else: + print("Error for [" + url + "]: could not access monitoring") + exit(-1) + + +if __name__ == "__main__": + all_args = argparse.ArgumentParser(description="Test Trino metrics.") + all_args.add_argument( + "-n", "--namespace", help="The namespace to run in", required=True + ) + args = vars(all_args.parse_args()) + namespace = args["namespace"] + + host_coordinator_0 = ( + "http://trino-coordinator-default-0.trino-coordinator-default." + + namespace + + ".svc.cluster.local" + ) + host_worker_0 = ( + "http://trino-worker-default-0.trino-worker-default." + + namespace + + ".svc.cluster.local" + ) + + hosts = [host_coordinator_0, host_worker_0] + + check_monitoring(hosts) + + print("Test check-metrics.py succeeded!") diff --git a/tests/templates/kuttl/smoke_aws/21-assert.yaml b/tests/templates/kuttl/smoke_aws/21-assert.yaml index 600736ce..b3f78bfc 100644 --- a/tests/templates/kuttl/smoke_aws/21-assert.yaml +++ b/tests/templates/kuttl/smoke_aws/21-assert.yaml @@ -6,3 +6,4 @@ commands: - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p admin -n $NAMESPACE -w 1 - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-opa.py -n $NAMESPACE - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-s3.py -n $NAMESPACE + - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-metrics.py -n $NAMESPACE diff --git a/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml b/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml index f38f3274..8c8ce409 100644 --- a/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml +++ b/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml @@ -5,3 +5,4 @@ commands: - script: kubectl cp -n $NAMESPACE 
./check-active-workers.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-opa.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-s3.py trino-test-helper-0:/tmp || true + - script: kubectl cp -n $NAMESPACE ./check-metrics.py trino-test-helper-0:/tmp || true diff --git a/tests/templates/kuttl/smoke_aws/check-metrics.py b/tests/templates/kuttl/smoke_aws/check-metrics.py new file mode 100644 index 00000000..5e06797b --- /dev/null +++ b/tests/templates/kuttl/smoke_aws/check-metrics.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +import argparse +import requests +import time + + +def print_request_error_and_sleep(message, err, retry_count): + print("[" + str(retry_count) + "] " + message, err) + time.sleep(5) + + +def try_get(url): + retries = 3 + for i in range(retries): + try: + r = requests.get(url, timeout=5, auth=("trino", "")) + r.raise_for_status() + return r + except requests.exceptions.HTTPError as errh: + print_request_error_and_sleep("Http Error: ", errh, i) + except requests.exceptions.ConnectionError as errc: + print_request_error_and_sleep("Error Connecting: ", errc, i) + except requests.exceptions.Timeout as errt: + print_request_error_and_sleep("Timeout Error: ", errt, i) + except requests.exceptions.RequestException as err: + print_request_error_and_sleep("Error: ", err, i) + + exit(-1) + + +def check_monitoring(hosts): + for host in hosts: + # test for the jmx exporter metrics + url = host + ":8081" + response = try_get(url) + + if response.ok: + continue + else: + print("Error for [" + url + "]: could not access monitoring") + exit(-1) + + # test for the native metrics + url = host + ":8443/metrics" + response = try_get(url) + + if response.ok: + # arbitrary metric was chosen to test if metrics are present in the response + if "io_airlift_node_name_NodeInfo_StartTime" in response.text: + continue + else: + print("Error for [" + url + "]: missing metrics") + exit(-1) + continue + else: + print("Error 
for [" + url + "]: could not access monitoring") + exit(-1) + + +if __name__ == "__main__": + all_args = argparse.ArgumentParser(description="Test Trino metrics.") + all_args.add_argument( + "-n", "--namespace", help="The namespace to run in", required=True + ) + args = vars(all_args.parse_args()) + namespace = args["namespace"] + + host_coordinator_0 = ( + "http://trino-coordinator-default-0.trino-coordinator-default." + + namespace + + ".svc.cluster.local" + ) + host_worker_0 = ( + "http://trino-worker-default-0.trino-worker-default." + + namespace + + ".svc.cluster.local" + ) + + hosts = [host_coordinator_0, host_worker_0] + + check_monitoring(hosts) + + print("Test check-metrics.py succeeded!") From d56eaef3bb6f7a2f5bd98b60e75c372649cd5057 Mon Sep 17 00:00:00 2001 From: Xenia Date: Thu, 19 Jun 2025 08:08:48 +0200 Subject: [PATCH 2/4] Update tests/templates/kuttl/smoke/check-metrics.py Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> --- tests/templates/kuttl/smoke/check-metrics.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/templates/kuttl/smoke/check-metrics.py b/tests/templates/kuttl/smoke/check-metrics.py index 5e06797b..f06d81a9 100644 --- a/tests/templates/kuttl/smoke/check-metrics.py +++ b/tests/templates/kuttl/smoke/check-metrics.py @@ -65,16 +65,8 @@ def check_monitoring(hosts): args = vars(all_args.parse_args()) namespace = args["namespace"] - host_coordinator_0 = ( - "http://trino-coordinator-default-0.trino-coordinator-default." - + namespace - + ".svc.cluster.local" - ) - host_worker_0 = ( - "http://trino-worker-default-0.trino-worker-default." 
- + namespace - + ".svc.cluster.local" - ) + host_coordinator_0 = f"http://trino-coordinator-default-0.trino-coordinator-default.{namespace}.svc.cluster.local" + host_worker_0 = f"http://trino-worker-default-0.trino-worker-default.{namespace}.svc.cluster.local" hosts = [host_coordinator_0, host_worker_0] From 97973b821cb34be00642b116f85b546ad33605cc Mon Sep 17 00:00:00 2001 From: xeniape Date: Thu, 19 Jun 2025 08:12:20 +0200 Subject: [PATCH 3/4] adjust smoke_aws tests --- tests/templates/kuttl/smoke_aws/check-metrics.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/templates/kuttl/smoke_aws/check-metrics.py b/tests/templates/kuttl/smoke_aws/check-metrics.py index 5e06797b..d042765f 100644 --- a/tests/templates/kuttl/smoke_aws/check-metrics.py +++ b/tests/templates/kuttl/smoke_aws/check-metrics.py @@ -13,7 +13,7 @@ def try_get(url): retries = 3 for i in range(retries): try: - r = requests.get(url, timeout=5, auth=("trino", "")) + r = requests.get(url, timeout=5, auth=("admin", "admin")) r.raise_for_status() return r except requests.exceptions.HTTPError as errh: @@ -65,16 +65,8 @@ def check_monitoring(hosts): args = vars(all_args.parse_args()) namespace = args["namespace"] - host_coordinator_0 = ( - "http://trino-coordinator-default-0.trino-coordinator-default." - + namespace - + ".svc.cluster.local" - ) - host_worker_0 = ( - "http://trino-worker-default-0.trino-worker-default." 
- + namespace - + ".svc.cluster.local" - ) + host_coordinator_0 = f"https://trino-coordinator-default-0.trino-coordinator-default.{namespace}.svc.cluster.local" + host_worker_0 = f"https://trino-worker-default-0.trino-worker-default.{namespace}.svc.cluster.local" hosts = [host_coordinator_0, host_worker_0] From b064649c26e0ee27b021c4652c7196451279968a Mon Sep 17 00:00:00 2001 From: xeniape Date: Fri, 20 Jun 2025 14:09:26 +0200 Subject: [PATCH 4/4] fix metrics test and move to commons folder --- .../kuttl/{smoke => commons}/check-metrics.py | 28 ++++--- .../kuttl/smoke/21-copy-scripts.yaml | 2 +- .../kuttl/smoke_aws/21-copy-scripts.yaml | 2 +- .../kuttl/smoke_aws/check-metrics.py | 75 ------------------- 4 files changed, 21 insertions(+), 86 deletions(-) rename tests/templates/kuttl/{smoke => commons}/check-metrics.py (71%) delete mode 100644 tests/templates/kuttl/smoke_aws/check-metrics.py diff --git a/tests/templates/kuttl/smoke/check-metrics.py b/tests/templates/kuttl/commons/check-metrics.py similarity index 71% rename from tests/templates/kuttl/smoke/check-metrics.py rename to tests/templates/kuttl/commons/check-metrics.py index f06d81a9..9a4a42c5 100644 --- a/tests/templates/kuttl/smoke/check-metrics.py +++ b/tests/templates/kuttl/commons/check-metrics.py @@ -13,7 +13,18 @@ def try_get(url): retries = 3 for i in range(retries): try: - r = requests.get(url, timeout=5, auth=("trino", "")) + if "coordinator" in url: + r = requests.get( + url, + timeout=5, + headers={"x-trino-user": "admin"}, + auth=("admin", "admin"), + verify=False, + ) + else: + r = requests.get( + url, timeout=5, headers={"x-trino-user": "admin"}, verify=False + ) r.raise_for_status() return r except requests.exceptions.HTTPError as errh: @@ -31,17 +42,15 @@ def try_get(url): def check_monitoring(hosts): for host in hosts: # test for the jmx exporter metrics - url = host + ":8081" + url = "http://" + host + ":8081/metrics" response = try_get(url) - if response.ok: - continue - else: + if not 
response.ok: print("Error for [" + url + "]: could not access monitoring") exit(-1) # test for the native metrics - url = host + ":8443/metrics" + url = "https://" + host + ":8443/metrics" response = try_get(url) if response.ok: @@ -51,7 +60,6 @@ def check_monitoring(hosts): else: print("Error for [" + url + "]: missing metrics") exit(-1) - continue else: print("Error for [" + url + "]: could not access monitoring") exit(-1) @@ -65,8 +73,10 @@ def check_monitoring(hosts): args = vars(all_args.parse_args()) namespace = args["namespace"] - host_coordinator_0 = f"http://trino-coordinator-default-0.trino-coordinator-default.{namespace}.svc.cluster.local" - host_worker_0 = f"http://trino-worker-default-0.trino-worker-default.{namespace}.svc.cluster.local" + host_coordinator_0 = f"trino-coordinator-default-0.trino-coordinator-default.{namespace}.svc.cluster.local" + host_worker_0 = ( + f"trino-worker-default-0.trino-worker-default.{namespace}.svc.cluster.local" + ) hosts = [host_coordinator_0, host_worker_0] diff --git a/tests/templates/kuttl/smoke/21-copy-scripts.yaml b/tests/templates/kuttl/smoke/21-copy-scripts.yaml index 8c8ce409..fc51e8f8 100644 --- a/tests/templates/kuttl/smoke/21-copy-scripts.yaml +++ b/tests/templates/kuttl/smoke/21-copy-scripts.yaml @@ -5,4 +5,4 @@ commands: - script: kubectl cp -n $NAMESPACE ./check-active-workers.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-opa.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-s3.py trino-test-helper-0:/tmp || true - - script: kubectl cp -n $NAMESPACE ./check-metrics.py trino-test-helper-0:/tmp || true + - script: kubectl cp -n $NAMESPACE ../../../../templates/kuttl/commons/check-metrics.py trino-test-helper-0:/tmp || true diff --git a/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml b/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml index 8c8ce409..fc51e8f8 100644 --- a/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml +++ 
b/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml @@ -5,4 +5,4 @@ commands: - script: kubectl cp -n $NAMESPACE ./check-active-workers.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-opa.py trino-test-helper-0:/tmp || true - script: kubectl cp -n $NAMESPACE ./check-s3.py trino-test-helper-0:/tmp || true - - script: kubectl cp -n $NAMESPACE ./check-metrics.py trino-test-helper-0:/tmp || true + - script: kubectl cp -n $NAMESPACE ../../../../templates/kuttl/commons/check-metrics.py trino-test-helper-0:/tmp || true diff --git a/tests/templates/kuttl/smoke_aws/check-metrics.py b/tests/templates/kuttl/smoke_aws/check-metrics.py deleted file mode 100644 index d042765f..00000000 --- a/tests/templates/kuttl/smoke_aws/check-metrics.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import requests -import time - - -def print_request_error_and_sleep(message, err, retry_count): - print("[" + str(retry_count) + "] " + message, err) - time.sleep(5) - - -def try_get(url): - retries = 3 - for i in range(retries): - try: - r = requests.get(url, timeout=5, auth=("admin", "admin")) - r.raise_for_status() - return r - except requests.exceptions.HTTPError as errh: - print_request_error_and_sleep("Http Error: ", errh, i) - except requests.exceptions.ConnectionError as errc: - print_request_error_and_sleep("Error Connecting: ", errc, i) - except requests.exceptions.Timeout as errt: - print_request_error_and_sleep("Timeout Error: ", errt, i) - except requests.exceptions.RequestException as err: - print_request_error_and_sleep("Error: ", err, i) - - exit(-1) - - -def check_monitoring(hosts): - for host in hosts: - # test for the jmx exporter metrics - url = host + ":8081" - response = try_get(url) - - if response.ok: - continue - else: - print("Error for [" + url + "]: could not access monitoring") - exit(-1) - - # test for the native metrics - url = host + ":8443/metrics" - response = try_get(url) - - if response.ok: - # 
arbitrary metric was chosen to test if metrics are present in the response - if "io_airlift_node_name_NodeInfo_StartTime" in response.text: - continue - else: - print("Error for [" + url + "]: missing metrics") - exit(-1) - continue - else: - print("Error for [" + url + "]: could not access monitoring") - exit(-1) - - -if __name__ == "__main__": - all_args = argparse.ArgumentParser(description="Test Trino metrics.") - all_args.add_argument( - "-n", "--namespace", help="The namespace to run in", required=True - ) - args = vars(all_args.parse_args()) - namespace = args["namespace"] - - host_coordinator_0 = f"https://trino-coordinator-default-0.trino-coordinator-default.{namespace}.svc.cluster.local" - host_worker_0 = f"https://trino-worker-default-0.trino-worker-default.{namespace}.svc.cluster.local" - - hosts = [host_coordinator_0, host_worker_0] - - check_monitoring(hosts) - - print("Test check-metrics.py succeeded!")