From 0489c3e9912432e1b8a434005aabe592157ea894 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Mon, 16 Sep 2024 18:16:05 +0000 Subject: [PATCH 01/15] add otlp exporter config --- docs/config.md | 2 ++ infra/main.bicep | 8 ++++---- loadtest/common/config.py | 3 ++- loadtest/common/locust_app_insights.py | 10 +++++----- loadtest/loadtest_chat_completions_1s_latency.py | 8 ++++---- loadtest/loadtest_chat_completions_no_added_latency.py | 8 ++++---- sample.env | 2 ++ scripts/_run-load-test-aca.sh | 6 +++--- src/aoai-api-simulator/requirements.txt | 3 +++ src/aoai-api-simulator/src/aoai_api_simulator/main.py | 6 +++--- 10 files changed, 32 insertions(+), 24 deletions(-) diff --git a/docs/config.md b/docs/config.md index d87c37b..446951f 100644 --- a/docs/config.md +++ b/docs/config.md @@ -107,6 +107,8 @@ The simulator supports a set of basic Open Telemetry configuration options. Thes | ----------------------------- | ----------------------------------------------------------------------------------------------- | | `OTEL_SERVICE_NAME` | Sets the value of the service name reported to Open Telemetry. Defaults to `aoai-api-simulator` | | `OTEL_METRIC_EXPORT_INTERVAL` | The time interval (in milliseconds) between the start of two export attempts.. 
| +| `APPLICATIONINSIGHTS_CONNECTION_STRING` | sets up the app insights connection string for telemetry | +`OTEL_EXPORTER_OTLP_ENDPOINT` | sets up the OpenTelemetry OTLP exporter endpoint | ## Config API Endpoint diff --git a/infra/main.bicep b/infra/main.bicep index 2c4cda5..3a77b34 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -170,7 +170,7 @@ resource azureOpenAIKeySecret 'Microsoft.KeyVault/vaults/secrets@2023-07-01' = { } resource appInsightsConnectionStringSecret 'Microsoft.KeyVault/vaults/secrets@2023-07-01' = { parent: vault - name: 'app-insights-connection-string' + name: 'applicationinsights-connection-string' properties: { value: appInsights.properties.ConnectionString } @@ -208,8 +208,8 @@ resource apiSim 'Microsoft.App/containerApps@2023-05-01' = { identity: managedIdentity.id } { - name: 'app-insights-connection-string' - keyVaultUrl: '${keyVaultUri}secrets/app-insights-connection-string' + name: 'applicationinsights-connection-string' + keyVaultUrl: '${keyVaultUri}secrets/applicationinsights-connection-string' identity: managedIdentity.id } { @@ -243,7 +243,7 @@ resource apiSim 'Microsoft.App/containerApps@2023-05-01' = { { name: 'AZURE_OPENAI_KEY', secretRef: 'azure-openai-key' } { name: 'OPENAI_DEPLOYMENT_CONFIG_PATH', value: '/mnt/deployment-config/simulator_deployment_config.json' } { name: 'LOG_LEVEL', value: logLevel } - { name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', secretRef: 'app-insights-connection-string' } + { name: 'APPLICATIONINSIGHTS_CONNECTION_STRING', secretRef: 'applicationinsights-connection-string' } // Ensure cloudRoleName is set in telemetry // https://opentelemetry-python.readthedocs.io/en/latest/sdk/environment_variables.html#opentelemetry.sdk.environment_variables.OTEL_SERVICE_NAME { name: 'OTEL_SERVICE_NAME', value: apiSimulatorName } diff --git a/loadtest/common/config.py b/loadtest/common/config.py index fa765bf..5973add 100644 --- a/loadtest/common/config.py +++ b/loadtest/common/config.py @@ -1,7 +1,8 @@ 
import os api_key = os.getenv("API_KEY", os.getenv("SIMULATOR_API_KEY")) -app_insights_connection_string = os.getenv("APP_INSIGHTS_CONNECTION_STRING") +opentelemetry_exporter_otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") +applicationinsights_connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") log_analytics_workspace_id = os.getenv("LOG_ANALYTICS_WORKSPACE_ID") log_analytics_workspace_name = os.getenv("LOG_ANALYTICS_WORKSPACE_NAME") tenant_id = os.getenv("TENANT_ID") diff --git a/loadtest/common/locust_app_insights.py b/loadtest/common/locust_app_insights.py index fee407d..2572912 100644 --- a/loadtest/common/locust_app_insights.py +++ b/loadtest/common/locust_app_insights.py @@ -1,18 +1,18 @@ import logging -from opentelemetry import metrics + from azure.monitor.opentelemetry import configure_azure_monitor +from opentelemetry import metrics from .config import ( - app_insights_connection_string, + applicationinsights_connection_string, ) - histogram_request_latency: metrics.Histogram -if app_insights_connection_string: +if applicationinsights_connection_string: # Options: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry#usage logging.getLogger("azure").setLevel(logging.WARNING) - configure_azure_monitor(connection_string=app_insights_connection_string) + configure_azure_monitor(connection_string=applicationinsights_connection_string) histogram_request_latency = metrics.get_meter(__name__).create_histogram( "locust.request_latency", "Request latency", "s" ) diff --git a/loadtest/loadtest_chat_completions_1s_latency.py b/loadtest/loadtest_chat_completions_1s_latency.py index 4be41ad..27487a5 100644 --- a/loadtest/loadtest_chat_completions_1s_latency.py +++ b/loadtest/loadtest_chat_completions_1s_latency.py @@ -1,13 +1,13 @@ import logging import os -from locust import HttpUser, task, constant, events -from locust.env import Environment -from common.config import api_key, 
app_insights_connection_string +from common.config import api_key, applicationinsights_connection_string from common.latency import set_simulator_chat_completions_latency from common.locust_app_insights import ( report_request_metric, ) +from locust import HttpUser, constant, events, task +from locust.env import Environment max_tokens = 100 deployment_name = os.getenv("DEPLOYMENT_NAME", None) @@ -21,7 +21,7 @@ def on_locust_init(environment: Environment, **_): """ Configure test """ - if app_insights_connection_string: + if applicationinsights_connection_string: logging.info("App Insights connection string found - enabling request metrics") environment.events.request.add_listener(report_request_metric) else: diff --git a/loadtest/loadtest_chat_completions_no_added_latency.py b/loadtest/loadtest_chat_completions_no_added_latency.py index 2acdc78..6e4f888 100644 --- a/loadtest/loadtest_chat_completions_no_added_latency.py +++ b/loadtest/loadtest_chat_completions_no_added_latency.py @@ -1,13 +1,13 @@ import logging import os -from locust import HttpUser, task, constant, events -from locust.env import Environment -from common.config import api_key, app_insights_connection_string +from common.config import api_key, applicationinsights_connection_string from common.latency import set_simulator_chat_completions_latency from common.locust_app_insights import ( report_request_metric, ) +from locust import HttpUser, constant, events, task +from locust.env import Environment max_tokens = int(os.getenv("MAX_TOKENS", "100")) deployment_name = os.getenv("DEPLOYMENT_NAME", None) @@ -25,7 +25,7 @@ def on_locust_init(environment: Environment, **_): """ Configure test """ - if app_insights_connection_string: + if applicationinsights_connection_string: logging.info("App Insights connection string found - enabling request metrics") environment.events.request.add_listener(report_request_metric) else: diff --git a/sample.env b/sample.env index a7c67a1..3c73e20 100644 --- a/sample.env 
+++ b/sample.env @@ -23,6 +23,8 @@ AZURE_FORM_RECOGNIZER_KEY= # Open Telemetry Config (used within the simulator) OTEL_SERVICE_NAME=aoai-api-simulator-local-dev OTEL_METRIC_EXPORT_INTERVAL=10000 +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +APPLICATIONINSIGHTS_CONNECTION_STRING= # Test Client Config (used to direct the tests and test clients) diff --git a/scripts/_run-load-test-aca.sh b/scripts/_run-load-test-aca.sh index 870504b..bcf7de4 100755 --- a/scripts/_run-load-test-aca.sh +++ b/scripts/_run-load-test-aca.sh @@ -84,8 +84,8 @@ if [[ -z "${key_vault_name}" ]]; then echo "Key Vault Name not found in output.json" exit 1 fi -app_insights_connection_string=$(az keyvault secret show --vault-name "$key_vault_name" --name app-insights-connection-string --query value --output tsv) -if [[ -z "${app_insights_connection_string}" ]]; then +applicationinsights_connection_string=$(az keyvault secret show --vault-name "$key_vault_name" --name applicationinsights-connection-string --query value --output tsv) +if [[ -z "${applicationinsights_connection_string}" ]]; then echo "App Insights Connection String not found in Key Vault" exit 1 fi @@ -127,7 +127,7 @@ az containerapp job create \ --cpu "1" \ --memory "2Gi" \ --command "locust" \ - --env-vars "LOCUST_LOCUSTFILE=$TEST_FILE" "LOCUST_HOST=https://${api_fqdn}/" "LOCUST_USERS=$LOCUST_USERS" "LOCUST_SPAWN_RATE=$LOCUST_SPAWN_RATE" "LOCUST_AUTOSTART=true" "LOCUST_RUN_TIME=$LOCUST_RUN_TIME" "LOCUST_AUTOQUIT=10" "SIMULATOR_API_KEY=${SIMULATOR_API_KEY}" "APP_INSIGHTS_CONNECTION_STRING=${app_insights_connection_string}" "MAX_TOKENS=${MAX_TOKENS}" "DEPLOYMENT_NAME=${DEPLOYMENT_NAME}" ALLOW_429_RESPONSES=${ALLOW_429_RESPONSES} 1>&2 + --env-vars "LOCUST_LOCUSTFILE=$TEST_FILE" "LOCUST_HOST=https://${api_fqdn}/" "LOCUST_USERS=$LOCUST_USERS" "LOCUST_SPAWN_RATE=$LOCUST_SPAWN_RATE" "LOCUST_AUTOSTART=true" "LOCUST_RUN_TIME=$LOCUST_RUN_TIME" "LOCUST_AUTOQUIT=10" "SIMULATOR_API_KEY=${SIMULATOR_API_KEY}" 
"APP_INSIGHTS_CONNECTION_STRING=${applicationinsights_connection_string}" "MAX_TOKENS=${MAX_TOKENS}" "DEPLOYMENT_NAME=${DEPLOYMENT_NAME}" ALLOW_429_RESPONSES=${ALLOW_429_RESPONSES} 1>&2 start_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") diff --git a/src/aoai-api-simulator/requirements.txt b/src/aoai-api-simulator/requirements.txt index 51eea7d..702bbb4 100644 --- a/src/aoai-api-simulator/requirements.txt +++ b/src/aoai-api-simulator/requirements.txt @@ -6,5 +6,8 @@ PyYAML==6.0.1 tiktoken==0.6.0 nanoid==2.0.0 limits==3.8.0 +opentelemetry-api==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-exporter-otlp==1.27.0 azure-monitor-opentelemetry==1.3.0 pydantic-settings==2.2.1 diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/main.py b/src/aoai-api-simulator/src/aoai_api_simulator/main.py index 05a8e75..d5ff01a 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/main.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/main.py @@ -1,12 +1,12 @@ import logging import os -from azure.monitor.opentelemetry import configure_azure_monitor +from aoai_api_simulator.app_builder import app as builder_app +from aoai_api_simulator.app_builder import apply_config # from opentelemetry import trace - from aoai_api_simulator.config_loader import get_config_from_env_vars, set_config -from aoai_api_simulator.app_builder import app as builder_app, apply_config +from azure.monitor.opentelemetry import configure_azure_monitor log_level = os.getenv("LOG_LEVEL") or "INFO" From c22b758c216dc14c9244aac7cf72ccf61131572d Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Mon, 16 Sep 2024 19:08:35 +0000 Subject: [PATCH 02/15] add otlp exporters --- .../src/aoai_api_simulator/main.py | 62 ++++++++++++++++++- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/main.py b/src/aoai-api-simulator/src/aoai_api_simulator/main.py index d5ff01a..054b6bf 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/main.py +++ 
b/src/aoai-api-simulator/src/aoai_api_simulator/main.py @@ -7,6 +7,20 @@ # from opentelemetry import trace from aoai_api_simulator.config_loader import get_config_from_env_vars, set_config from azure.monitor.opentelemetry import configure_azure_monitor +from opentelemetry import metrics, trace +from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor + +# from opentelemetry.sdk._logs.export import ConsoleLogExporter +from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor log_level = os.getenv("LOG_LEVEL") or "INFO" @@ -14,20 +28,64 @@ logging.basicConfig(level=log_level) logging.getLogger("azure").setLevel(logging.WARNING) +opentelemetry_exporter_otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") application_insights_connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") + +using_azure_monitor: bool + if application_insights_connection_string: logger.info("🚀 Configuring Azure Monitor telemetry") # Options: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry#usage configure_azure_monitor(connection_string=application_insights_connection_string) + using_azure_monitor = True else: + using_azure_monitor = False logger.info("🚀 Azure Monitor telemetry not configured (set APPLICATIONINSIGHTS_CONNECTION_STRING)") -# tracer = trace.get_tracer(__name__) + if 
opentelemetry_exporter_otlp_endpoint: + logger.info("🚀 Configuring OTLP telemetry") + + # setup the instrumentors + resource = Resource(attributes={"service.name": "aoai-api-simulator"}) + + trace.set_tracer_provider(TracerProvider(resource=resource)) + tracer = trace.get_tracer(__name__) + + # https://learn.microsoft.com/en-us/azure/azure-monitor/app/opentelemetry-configuration?tabs=python#enable-the-otlp-exporter + + otlp_exporter = OTLPSpanExporter(endpoint=opentelemetry_exporter_otlp_endpoint) + + # tracing + tracer = trace.get_tracer(__name__) + span_processor = BatchSpanProcessor(otlp_exporter) + trace.get_tracer_provider().add_span_processor(span_processor) + + # metrics + reader = PeriodicExportingMetricReader(OTLPMetricExporter(endpoint=opentelemetry_exporter_otlp_endpoint)) + meterProvider = MeterProvider(resource=resource, metric_readers=[reader]) + metrics.set_meter_provider(meterProvider) + + # logging + logger_provider = LoggerProvider( + resource=resource, + ) + + otlp_exporter = OTLPLogExporter(endpoint=opentelemetry_exporter_otlp_endpoint) + logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter)) + + handler = LoggingHandler(level=logging.NOTSET, logger_provider=logger_provider) + # Attach OTLP handler to root logger + logging.getLogger().addHandler(handler) + else: + logger.info("🚀 OTLP telemetry exporter not configured (set OTEL_EXPORTER_OTLP_ENDPOINT)") config = get_config_from_env_vars(logger) set_config(config) - apply_config() + app = builder_app # expose to gunicorn + +if not using_azure_monitor: + FastAPIInstrumentor.instrument_app(app) From 0f156a8b14b7f8d68f2034af50a4faf087410b68 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Mon, 16 Sep 2024 19:31:54 +0000 Subject: [PATCH 03/15] fixup --- docs/config.md | 4 ++-- src/aoai-api-simulator/src/aoai_api_simulator/main.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/config.md b/docs/config.md index 446951f..c3c5b4e 100644 --- 
a/docs/config.md +++ b/docs/config.md @@ -107,8 +107,8 @@ The simulator supports a set of basic Open Telemetry configuration options. Thes | ----------------------------- | ----------------------------------------------------------------------------------------------- | | `OTEL_SERVICE_NAME` | Sets the value of the service name reported to Open Telemetry. Defaults to `aoai-api-simulator` | | `OTEL_METRIC_EXPORT_INTERVAL` | The time interval (in milliseconds) between the start of two export attempts.. | -| `APPLICATIONINSIGHTS_CONNECTION_STRING` | sets up the app insights connection string for telemetry | -`OTEL_EXPORTER_OTLP_ENDPOINT` | sets up the OpenTelemetry OTLP exporter endpoint | +| `APPLICATIONINSIGHTS_CONNECTION_STRING` | Sets up the app insights connection string for telemetry | +`OTEL_EXPORTER_OTLP_ENDPOINT` | Sets up the OpenTelemetry OTLP exporter endpoint. This can be further customised using environment variables described [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/). i.e. 
`OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` or `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | ## Config API Endpoint diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/main.py b/src/aoai-api-simulator/src/aoai_api_simulator/main.py index 054b6bf..6fa8625 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/main.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/main.py @@ -54,7 +54,7 @@ # https://learn.microsoft.com/en-us/azure/azure-monitor/app/opentelemetry-configuration?tabs=python#enable-the-otlp-exporter - otlp_exporter = OTLPSpanExporter(endpoint=opentelemetry_exporter_otlp_endpoint) + otlp_exporter = OTLPSpanExporter() # tracing tracer = trace.get_tracer(__name__) @@ -62,7 +62,7 @@ trace.get_tracer_provider().add_span_processor(span_processor) # metrics - reader = PeriodicExportingMetricReader(OTLPMetricExporter(endpoint=opentelemetry_exporter_otlp_endpoint)) + reader = PeriodicExportingMetricReader(OTLPMetricExporter()) meterProvider = MeterProvider(resource=resource, metric_readers=[reader]) metrics.set_meter_provider(meterProvider) @@ -71,10 +71,10 @@ resource=resource, ) - otlp_exporter = OTLPLogExporter(endpoint=opentelemetry_exporter_otlp_endpoint) + otlp_exporter = OTLPLogExporter() logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter)) - handler = LoggingHandler(level=logging.NOTSET, logger_provider=logger_provider) + handler = LoggingHandler(level=logging.INFO, logger_provider=logger_provider) # Attach OTLP handler to root logger logging.getLogger().addHandler(handler) else: From 1dfd7c96eb2695d03fe5773d896cfe47bd1733a0 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Mon, 16 Sep 2024 19:37:46 +0000 Subject: [PATCH 04/15] fixup --- src/aoai-api-simulator/src/aoai_api_simulator/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/main.py b/src/aoai-api-simulator/src/aoai_api_simulator/main.py index 6fa8625..ac705ec 
100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/main.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/main.py @@ -74,7 +74,7 @@ otlp_exporter = OTLPLogExporter() logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter)) - handler = LoggingHandler(level=logging.INFO, logger_provider=logger_provider) + handler = LoggingHandler(level=os.getenv("OTEL_LOG_LEVEL", "INFO"), logger_provider=logger_provider) # Attach OTLP handler to root logger logging.getLogger().addHandler(handler) else: From d5d15cafdccc47ace0f922eb8ea3689e0a747d5b Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Tue, 17 Sep 2024 10:54:47 -0700 Subject: [PATCH 05/15] add docker compose for telemetry --- .gitignore | 2 +- Makefile | 3 +++ build/telemetry-docker-compose.yaml | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 build/telemetry-docker-compose.yaml diff --git a/.gitignore b/.gitignore index 057bbf5..0e063ae 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,7 @@ __pycache__/ # Distribution / packaging .Python -build/ +*/build/ develop-eggs/ dist/ downloads/ diff --git a/Makefile b/Makefile index c23c815..9119cb3 100644 --- a/Makefile +++ b/Makefile @@ -91,3 +91,6 @@ docker-build-load-test: ## Build the AOAI Simulated API Load Test as a docker im erase-recording: ## Erase all *.recording files rm -rf "${makefile_dir}.recording" +start-telemetry: + -docker-compose -f build/telemetry-docker-compose.yaml down + docker-compose -f ./build/telemetry-docker-compose.yaml up \ No newline at end of file diff --git a/build/telemetry-docker-compose.yaml b/build/telemetry-docker-compose.yaml new file mode 100644 index 0000000..7db8c05 --- /dev/null +++ b/build/telemetry-docker-compose.yaml @@ -0,0 +1,8 @@ +services: + grafana-all-in-one: + image: grafana/otel-lgtm + container_name: otel-lgtm + ports: + - "3000:3000" # Grafana Web UI + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP http receiver From 
d30b6a4853d819b2db8eb46eb1e6e047ede15cfd Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Tue, 17 Sep 2024 21:01:56 +0000 Subject: [PATCH 06/15] read service name from env --- build/telemetry-docker-compose.yaml | 2 +- src/aoai-api-simulator/src/aoai_api_simulator/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build/telemetry-docker-compose.yaml b/build/telemetry-docker-compose.yaml index 7db8c05..0649f7c 100644 --- a/build/telemetry-docker-compose.yaml +++ b/build/telemetry-docker-compose.yaml @@ -1,5 +1,5 @@ services: - grafana-all-in-one: + grafana-all-in-one: # https://grafana.com/blog/2024/03/13/an-opentelemetry-backend-in-a-docker-image-introducing-grafana/otel-lgtm/ image: grafana/otel-lgtm container_name: otel-lgtm ports: diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/main.py b/src/aoai-api-simulator/src/aoai_api_simulator/main.py index ac705ec..b4dbb49 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/main.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/main.py @@ -47,7 +47,7 @@ logger.info("🚀 Configuring OTLP telemetry") # setup the instrumentors - resource = Resource(attributes={"service.name": "aoai-api-simulator"}) + resource = Resource(attributes={"service.name": os.getenv("OTEL_SERVICE_NAME", "aoai-api-simulator")}) trace.set_tracer_provider(TracerProvider(resource=resource)) tracer = trace.get_tracer(__name__) From bce47c75bdf5c9ddace033ea8f064de74dbdf0c5 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Tue, 17 Sep 2024 14:56:58 -0700 Subject: [PATCH 07/15] add request instrumentor --- sample.env | 2 +- src/aoai-api-simulator/requirements.txt | 1 + src/aoai-api-simulator/src/aoai_api_simulator/main.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sample.env b/sample.env index 3c73e20..794225a 100644 --- a/sample.env +++ b/sample.env @@ -23,7 +23,7 @@ AZURE_FORM_RECOGNIZER_KEY= # Open Telemetry Config (used within the simulator) 
OTEL_SERVICE_NAME=aoai-api-simulator-local-dev OTEL_METRIC_EXPORT_INTERVAL=10000 -OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317 APPLICATIONINSIGHTS_CONNECTION_STRING= diff --git a/src/aoai-api-simulator/requirements.txt b/src/aoai-api-simulator/requirements.txt index 702bbb4..502b8ca 100644 --- a/src/aoai-api-simulator/requirements.txt +++ b/src/aoai-api-simulator/requirements.txt @@ -2,6 +2,7 @@ fastapi==0.109.2 uvicorn[standard]==0.27.0.post1 gunicorn==22.0.0 requests==2.32.0 +opentelemetry-instrumentation-requests==0.48b0 PyYAML==6.0.1 tiktoken==0.6.0 nanoid==2.0.0 diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/main.py b/src/aoai-api-simulator/src/aoai_api_simulator/main.py index b4dbb49..12c31eb 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/main.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/main.py @@ -12,6 +12,7 @@ from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.requests import RequestsInstrumentor # from opentelemetry.sdk._logs.export import ConsoleLogExporter from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler @@ -52,8 +53,6 @@ trace.set_tracer_provider(TracerProvider(resource=resource)) tracer = trace.get_tracer(__name__) - # https://learn.microsoft.com/en-us/azure/azure-monitor/app/opentelemetry-configuration?tabs=python#enable-the-otlp-exporter - otlp_exporter = OTLPSpanExporter() # tracing @@ -88,4 +87,5 @@ app = builder_app # expose to gunicorn if not using_azure_monitor: + RequestsInstrumentor().instrument() FastAPIInstrumentor.instrument_app(app) From 103ba8bed5b00ffed95acfd665d2a7f8428fd1b3 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Tue, 17 Sep 2024 15:00:13 -0700 Subject: [PATCH 08/15] fixup --- 
docs/config.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/config.md b/docs/config.md index c3c5b4e..c2825b4 100644 --- a/docs/config.md +++ b/docs/config.md @@ -108,7 +108,7 @@ The simulator supports a set of basic Open Telemetry configuration options. Thes | `OTEL_SERVICE_NAME` | Sets the value of the service name reported to Open Telemetry. Defaults to `aoai-api-simulator` | | `OTEL_METRIC_EXPORT_INTERVAL` | The time interval (in milliseconds) between the start of two export attempts.. | | `APPLICATIONINSIGHTS_CONNECTION_STRING` | Sets up the app insights connection string for telemetry | -`OTEL_EXPORTER_OTLP_ENDPOINT` | Sets up the OpenTelemetry OTLP exporter endpoint. This can be further customised using environment variables described [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/). i.e. `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` or `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | +|`OTEL_EXPORTER_OTLP_ENDPOINT` | Sets up the OpenTelemetry OTLP exporter endpoint. This can be further customised using environment variables described [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/). i.e. 
`OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` or `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | ## Config API Endpoint From bfffbe89deeb2d182564c4209d35ef5e6f4699e2 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Wed, 18 Sep 2024 21:23:15 +0000 Subject: [PATCH 09/15] move telemetry to separate file --- .devcontainer/devcontainer.json | 3 + sample.env | 3 +- .../src/aoai_api_simulator/config_loader.py | 7 +- .../src/aoai_api_simulator/main.py | 84 +----------------- .../src/aoai_api_simulator/telemetry.py | 86 +++++++++++++++++++ 5 files changed, 98 insertions(+), 85 deletions(-) create mode 100644 src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 40cd9b1..c0be6df 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -34,4 +34,7 @@ // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root" + "mounts": [ + "source=${env:HOME}${env:USERPROFILE}/.ssh,target=/home/vscode/.ssh,type=bind,readonly" + ] } diff --git a/sample.env b/sample.env index 794225a..3c4d131 100644 --- a/sample.env +++ b/sample.env @@ -23,7 +23,8 @@ AZURE_FORM_RECOGNIZER_KEY= # Open Telemetry Config (used within the simulator) OTEL_SERVICE_NAME=aoai-api-simulator-local-dev OTEL_METRIC_EXPORT_INTERVAL=10000 -OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317 +# OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317 +OTEL_EXPORTER_OTLP_ENDPOINT= APPLICATIONINSIGHTS_CONNECTION_STRING= diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/config_loader.py b/src/aoai-api-simulator/src/aoai_api_simulator/config_loader.py index 38d91da..d56aa9e 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/config_loader.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/config_loader.py @@ -2,19 +2,19 @@ import json import logging import os - import sys +from 
aoai_api_simulator.generator.manager import get_default_generators from aoai_api_simulator.limiters import get_default_limiters from aoai_api_simulator.models import Config, OpenAIDeployment from aoai_api_simulator.record_replay.handler import get_default_forwarders -from aoai_api_simulator.generator.manager import get_default_generators -def get_config_from_env_vars(logger: logging.Logger) -> Config: +def get_config_from_env_vars() -> Config: """ Load configuration from environment variables """ + logger = logging.getLogger() config = Config(generators=get_default_generators()) config.recording.forwarders = get_default_forwarders() config.openai_deployments = _load_openai_deployments(logger) @@ -95,7 +95,6 @@ def _default_openai_deployments() -> dict[str, OpenAIDeployment]: def load_extension(config: Config): - extension_path = config.extension_path if not extension_path: return diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/main.py b/src/aoai-api-simulator/src/aoai_api_simulator/main.py index 12c31eb..ef527a0 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/main.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/main.py @@ -1,91 +1,15 @@ -import logging -import os - from aoai_api_simulator.app_builder import app as builder_app from aoai_api_simulator.app_builder import apply_config # from opentelemetry import trace from aoai_api_simulator.config_loader import get_config_from_env_vars, set_config -from azure.monitor.opentelemetry import configure_azure_monitor -from opentelemetry import metrics, trace -from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter -from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor -from opentelemetry.instrumentation.requests import RequestsInstrumentor - -# from opentelemetry.sdk._logs.export import 
ConsoleLogExporter -from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler -from opentelemetry.sdk._logs.export import BatchLogRecordProcessor -from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor - -log_level = os.getenv("LOG_LEVEL") or "INFO" - -logger = logging.getLogger(__name__) -logging.basicConfig(level=log_level) -logging.getLogger("azure").setLevel(logging.WARNING) - -opentelemetry_exporter_otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") -application_insights_connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") - -using_azure_monitor: bool - -if application_insights_connection_string: - logger.info("🚀 Configuring Azure Monitor telemetry") - - # Options: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry#usage - configure_azure_monitor(connection_string=application_insights_connection_string) - using_azure_monitor = True -else: - using_azure_monitor = False - logger.info("🚀 Azure Monitor telemetry not configured (set APPLICATIONINSIGHTS_CONNECTION_STRING)") - - if opentelemetry_exporter_otlp_endpoint: - logger.info("🚀 Configuring OTLP telemetry") +from aoai_api_simulator.telemetry import setup_auto_instrumentation, setup_telemetry - # setup the instrumentors - resource = Resource(attributes={"service.name": os.getenv("OTEL_SERVICE_NAME", "aoai-api-simulator")}) +using_azure_monitor: bool = setup_telemetry() - trace.set_tracer_provider(TracerProvider(resource=resource)) - tracer = trace.get_tracer(__name__) - - otlp_exporter = OTLPSpanExporter() - - # tracing - tracer = trace.get_tracer(__name__) - span_processor = BatchSpanProcessor(otlp_exporter) - trace.get_tracer_provider().add_span_processor(span_processor) - - # metrics - reader = 
PeriodicExportingMetricReader(OTLPMetricExporter()) - meterProvider = MeterProvider(resource=resource, metric_readers=[reader]) - metrics.set_meter_provider(meterProvider) - - # logging - logger_provider = LoggerProvider( - resource=resource, - ) - - otlp_exporter = OTLPLogExporter() - logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter)) - - handler = LoggingHandler(level=os.getenv("OTEL_LOG_LEVEL", "INFO"), logger_provider=logger_provider) - # Attach OTLP handler to root logger - logging.getLogger().addHandler(handler) - else: - logger.info("🚀 OTLP telemetry exporter not configured (set OTEL_EXPORTER_OTLP_ENDPOINT)") - -config = get_config_from_env_vars(logger) +config = get_config_from_env_vars() set_config(config) - apply_config() app = builder_app # expose to gunicorn - -if not using_azure_monitor: - RequestsInstrumentor().instrument() - FastAPIInstrumentor.instrument_app(app) +setup_auto_instrumentation(app, using_azure_monitor) diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py new file mode 100644 index 0000000..5d4034b --- /dev/null +++ b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py @@ -0,0 +1,86 @@ +import logging +import os + +# from opentelemetry import trace +from azure.monitor.opentelemetry import configure_azure_monitor +from fastapi import FastAPI +from opentelemetry import metrics, trace +from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.requests import RequestsInstrumentor +from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from 
opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor + +log_level = os.getenv("LOG_LEVEL") or "INFO" + +logger = logging.getLogger(__name__) +logging.basicConfig(level=log_level) +logging.getLogger("azure").setLevel(logging.WARNING) + +opentelemetry_exporter_otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") +application_insights_connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") + + +def setup_telemetry() -> bool: + using_azure_monitor: bool + + if application_insights_connection_string: + logger.info("🚀 Configuring Azure Monitor telemetry") + + # Options: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry#usage + configure_azure_monitor(connection_string=application_insights_connection_string) + using_azure_monitor = True + else: + using_azure_monitor = False + logger.info("Azure Monitor telemetry not configured (set APPLICATIONINSIGHTS_CONNECTION_STRING)") + + if opentelemetry_exporter_otlp_endpoint: + logger.info("🚀 Configuring OTLP telemetry") + + # setup the instrumentors + if not using_azure_monitor: + resource = Resource(attributes={"service.name": os.getenv("OTEL_SERVICE_NAME", "aoai-api-simulator")}) + trace.set_tracer_provider(TracerProvider(resource=resource)) + + # tracing + span_processor = BatchSpanProcessor(OTLPSpanExporter()) + trace.get_tracer_provider().add_span_processor(span_processor) + + # metrics + if not using_azure_monitor: + meter_provider = MeterProvider(resource=resource, metric_readers=[]) + metrics.set_meter_provider(meter_provider) + + metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter()) + metrics.get_meter_provider().add_metric_reader(metric_reader) + + # logging + logger_provider = LoggerProvider( + 
resource=resource, + ) + + batch_log_record_processor = BatchLogRecordProcessor(OTLPLogExporter()) + logger_provider.add_log_record_processor(batch_log_record_processor) + + handler = LoggingHandler(level=os.getenv("OTEL_LOG_LEVEL", "INFO"), logger_provider=logger_provider) + # Attach OTLP handler to root logger + logging.getLogger().addHandler(handler) + else: + logger.info("🚀 OTLP telemetry exporter not configured (set OTEL_EXPORTER_OTLP_ENDPOINT)") + + return using_azure_monitor + + +def setup_auto_instrumentation(app: FastAPI, using_azure_monitor: bool): + if not using_azure_monitor: + RequestsInstrumentor().instrument() + FastAPIInstrumentor.instrument_app(app) + else: + logger.info("Skipping instrumenting libraries as they are done by the Azure OTEL Distro already.") From 52026cfed33b77066b071ba5380f6a1a0b4b1c11 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Wed, 18 Sep 2024 21:50:26 +0000 Subject: [PATCH 10/15] fixup --- .../src/aoai_api_simulator/telemetry.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py index 5d4034b..2cfc772 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py @@ -13,6 +13,12 @@ from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics._internal.measurement_consumer import ( + SynchronousMeasurementConsumer, +) +from opentelemetry.sdk.metrics._internal.sdk_configuration import ( + SdkConfiguration, +) from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider @@ -45,21 +51,24 @@ def setup_telemetry() -> bool: logger.info("🚀 
Configuring OTLP telemetry") # setup the instrumentors - if not using_azure_monitor: - resource = Resource(attributes={"service.name": os.getenv("OTEL_SERVICE_NAME", "aoai-api-simulator")}) - trace.set_tracer_provider(TracerProvider(resource=resource)) + resource = Resource(attributes={"service.name": os.getenv("OTEL_SERVICE_NAME", "aoai-api-simulator")}) # tracing + if not using_azure_monitor: + trace.set_tracer_provider(TracerProvider(resource=resource)) span_processor = BatchSpanProcessor(OTLPSpanExporter()) trace.get_tracer_provider().add_span_processor(span_processor) # metrics + metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter()) if not using_azure_monitor: - meter_provider = MeterProvider(resource=resource, metric_readers=[]) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) metrics.set_meter_provider(meter_provider) - - metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter()) - metrics.get_meter_provider().add_metric_reader(metric_reader) + else: + sdk_config = SdkConfiguration(resource=resource, metric_readers=[metric_reader], views=[]) + measurement_consumer = SynchronousMeasurementConsumer(sdk_config=sdk_config) + meter_provider._all_metric_readers.add(metric_reader) + meter_provider._set_collect_callback(measurement_consumer.collect) # logging logger_provider = LoggerProvider( From bfb8d0c8bc7fbca94e1fd03a575f5ac6e60b488c Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Wed, 18 Sep 2024 22:05:35 +0000 Subject: [PATCH 11/15] fixup --- .../src/aoai_api_simulator/telemetry.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py index 2cfc772..1a634d9 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py @@ -13,12 +13,6 @@ from opentelemetry.sdk._logs import 
LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics._internal.measurement_consumer import ( - SynchronousMeasurementConsumer, -) -from opentelemetry.sdk.metrics._internal.sdk_configuration import ( - SdkConfiguration, -) from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider @@ -61,14 +55,14 @@ def setup_telemetry() -> bool: # metrics metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter()) + if not using_azure_monitor: meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) metrics.set_meter_provider(meter_provider) - else: - sdk_config = SdkConfiguration(resource=resource, metric_readers=[metric_reader], views=[]) - measurement_consumer = SynchronousMeasurementConsumer(sdk_config=sdk_config) - meter_provider._all_metric_readers.add(metric_reader) - meter_provider._set_collect_callback(measurement_consumer.collect) + + meter_provider = metrics.get_meter_provider() + meter_provider._all_metric_readers.add(metric_reader) + metric_reader._set_collect_callback(meter_provider._measurement_consumer.collect) # logging logger_provider = LoggerProvider( From ebeb49dde14ced2666c9588c5abe82dab896cafd Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Wed, 18 Sep 2024 22:24:44 +0000 Subject: [PATCH 12/15] fixup --- .../src/aoai_api_simulator/telemetry.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py index 1a634d9..42fdae7 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py @@ -50,6 +50,7 @@ def setup_telemetry() -> bool: # tracing if not using_azure_monitor: 
trace.set_tracer_provider(TracerProvider(resource=resource)) + span_processor = BatchSpanProcessor(OTLPSpanExporter()) trace.get_tracer_provider().add_span_processor(span_processor) @@ -59,10 +60,12 @@ def setup_telemetry() -> bool: if not using_azure_monitor: meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) metrics.set_meter_provider(meter_provider) - - meter_provider = metrics.get_meter_provider() - meter_provider._all_metric_readers.add(metric_reader) - metric_reader._set_collect_callback(meter_provider._measurement_consumer.collect) + else: + meter_provider = metrics.get_meter_provider() + # meter_provider.add_metric_reader() is not implemented in python sdk yet. + # adding it manually + meter_provider._all_metric_readers.add(metric_reader) + metric_reader._set_collect_callback(meter_provider._measurement_consumer.collect) # logging logger_provider = LoggerProvider( From 6db7cbb6bafbcdeccf7c553698fe0336d9692bed Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Wed, 25 Sep 2024 18:25:01 +1000 Subject: [PATCH 13/15] refactor --- .devcontainer/devcontainer.json | 7 +-- README.md | 7 +++ docs/config.md | 2 +- docs/{metrics.md => telemetry.md} | 42 +++++++++----- sample.env | 2 +- .../src/aoai_api_simulator/latency.py | 2 +- .../src/aoai_api_simulator/limiters.py | 2 +- .../src/aoai_api_simulator/metrics.py | 57 ------------------- .../src/aoai_api_simulator/telemetry.py | 55 ++++++++++++++++++ 9 files changed, 97 insertions(+), 79 deletions(-) rename docs/{metrics.md => telemetry.md} (53%) delete mode 100644 src/aoai-api-simulator/src/aoai_api_simulator/metrics.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index c0be6df..654cf4d 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -23,7 +23,8 @@ } }, "forwardPorts": [ - 5000 // test-client-web port + 5000, // test-client-web port + 3000 // grafana UI port ], // Use 'postCreateCommand' to run commands after the 
container is created. @@ -34,7 +35,5 @@ // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root" - "mounts": [ - "source=${env:HOME}${env:USERPROFILE}/.ssh,target=/home/vscode/.ssh,type=bind,readonly" - ] + "mounts": [] } diff --git a/README.md b/README.md index b4a36a7..fbb804b 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ WARNING: This is a work in progress! - [How to Get Started with the Azure OpenAI API Simulator](#how-to-get-started-with-the-azure-openai-api-simulator) - [Running and Deploying the Azure OpenAI API Simulator](#running-and-deploying-the-azure-openai-api-simulator) - [Configuring the Azure OpenAI API Simulator](#configuring-the-azure-openai-api-simulator) + - [Monitoring the Azure OpenAI API Simulator](#monitoring-the-azure-openai-api-simulator) - [Extending the Azure OpenAI API Simulator](#extending-the-azure-openai-api-simulator) - [Contributing to the Azure OpenAI API Simulator](#contributing-to-the-azure-openai-api-simulator) - [Changelog](#changelog) @@ -91,6 +92,12 @@ The document [Running and Deploying the Azure OpenAI API Simulator](./docs/runni The behaviour of the Azure OpenAI API Simulator is controlled via a range of [Azure OpenAI API Simulator Configuration Options](./docs/config.md). +### Monitoring the Azure OpenAI API Simulator + +The Azure OpenAI API Simulator is instrumented using OpenTelemetry and supports exporting telemetry to Azure Monitor or an OTLP endpoint. + +See the [telemetry documentation](./docs/telemetry.md) on how to configure the application to export telemetry and the types of metrics captured. + ### Extending the Azure OpenAI API Simulator There are also a number of [Azure OpenAI API Simulator Extension points](./docs/extending.md) that allow you to customise the behaviour of the Azure OpenAI API Simulator. Extensions can be used to modify the request/response, add latency, or even generate responses. 
diff --git a/docs/config.md b/docs/config.md index a77c483..48cf3cc 100644 --- a/docs/config.md +++ b/docs/config.md @@ -115,7 +115,7 @@ The simulator supports a set of basic Open Telemetry configuration options. Thes | `OTEL_SERVICE_NAME` | Sets the value of the service name reported to Open Telemetry. Defaults to `aoai-api-simulator` | | `OTEL_METRIC_EXPORT_INTERVAL` | The time interval (in milliseconds) between the start of two export attempts.. | | `APPLICATIONINSIGHTS_CONNECTION_STRING` | Sets up the app insights connection string for telemetry | -|`OTEL_EXPORTER_OTLP_ENDPOINT` | Sets up the OpenTelemetry OTLP exporter endpoint. This can be further customised using environment variables described [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/). i.e. `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` or `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | Sets up the OpenTelemetry OTLP exporter endpoint. This can be further customised using environment variables described [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/). i.e. `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` or `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | ## Config API Endpoint diff --git a/docs/metrics.md b/docs/telemetry.md similarity index 53% rename from docs/metrics.md rename to docs/telemetry.md index a4c0e38..471d623 100644 --- a/docs/metrics.md +++ b/docs/telemetry.md @@ -1,16 +1,24 @@ -# Azure OpenAI API Simulator Metrics +# Azure OpenAI API Simulator Telemetry + +This solution is instrumented using OpenTelemetry. The [Azure OpenTelemetry distribution](https://learn.microsoft.com/en-us/python/api/overview/azure/monitor-opentelemetry-readme?view=azure-python) library is used to instrument and export telemetry to an Azure Monitor instance defined in the `APPLICATIONINSIGHTS_CONNECTION_STRING` environment variable. 
+ +In addition to this, the solution also supports exporting to an [OTLP receiver](https://github.com/open-telemetry/opentelemetry-collector/blob/main/receiver/otlpreceiver/README.md) (i.e. OpenTelemetry Collector) using the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable. + +## Metrics To help you understand how the API Simulator is performing, we provide a number of metrics that you can use to monitor the simulator. -- [Azure OpenAI API Simulator Metrics](#azure-openai-api-simulator-metrics) - - [aoai-api-simulator.latency.base](#aoai-api-simulatorlatencybase) - - [aoai-api-simulator.latency.full](#aoai-api-simulatorlatencyfull) - - [aoai-api-simulator.tokens.used](#aoai-api-simulatortokensused) - - [aoai-api-simulator.tokens.requested](#aoai-api-simulatortokensrequested) - - [aoai-api-simulator.tokens.rate-limit](#aoai-api-simulatortokensrate-limit) - - [aoai-api-simulator.limits](#aoai-api-simulatorlimits) +- [Azure OpenAI API Simulator Telemetry](#azure-openai-api-simulator-telemetry) + - [Metrics](#metrics) + - [aoai-api-simulator.latency.base](#aoai-api-simulatorlatencybase) + - [aoai-api-simulator.latency.full](#aoai-api-simulatorlatencyfull) + - [aoai-api-simulator.tokens.used](#aoai-api-simulatortokensused) + - [aoai-api-simulator.tokens.requested](#aoai-api-simulatortokensrequested) + - [aoai-api-simulator.tokens.rate-limit](#aoai-api-simulatortokensrate-limit) + - [aoai-api-simulator.limits](#aoai-api-simulatorlimits) + - [Running Locally](#running-locally) -## aoai-api-simulator.latency.base +### aoai-api-simulator.latency.base Units: `seconds` @@ -21,7 +29,7 @@ Dimensions: - `deployment`: The name of the deployment the metric relates to. - `status_code`: The HTTP status code of the response. -## aoai-api-simulator.latency.full +### aoai-api-simulator.latency.full Units: `seconds` @@ -32,7 +40,7 @@ Dimensions: - `deployment`: The name of the deployment the metric relates to. - `status_code`: The HTTP status code of the response. 
-## aoai-api-simulator.tokens.used +### aoai-api-simulator.tokens.used Units: `tokens` @@ -43,7 +51,7 @@ Dimensions: - `deployment`: The name of the deployment the metric relates to. - `token_type`: The type of token, e.g. `prompt` or `completion`. -## aoai-api-simulator.tokens.requested +### aoai-api-simulator.tokens.requested Units: `tokens` @@ -54,7 +62,7 @@ Dimensions: - `deployment`: The name of the deployment the metric relates to. - `token_type`: The type of token, e.g. `prompt` or `completion`. -## aoai-api-simulator.tokens.rate-limit +### aoai-api-simulator.tokens.rate-limit Units: `tokens` @@ -64,7 +72,7 @@ Dimensions: - `deployment`: The name of the deployment the metric relates to. -## aoai-api-simulator.limits +### aoai-api-simulator.limits Units: `requests` @@ -74,3 +82,9 @@ Dimensions: - `deployment`: The name of the deployment the metric relates to. - `limit_type`: The type of limit that was hit, e.g. `requests` or `tokens`. + +## Running Locally + +The `make start-telemetry` command starts the `grafana/otel-lgtm` container. This is an [all-in-one container](https://grafana.com/blog/2024/03/13/an-opentelemetry-backend-in-a-docker-image-introducing-grafana/otel-lgtm/) to capture traces, metrics and logs. + +It exposes `grafana` UI on port `3000`. 
\ No newline at end of file
diff --git a/sample.env b/sample.env
index d10951d..89ce5ad 100644
--- a/sample.env
+++ b/sample.env
@@ -28,7 +28,7 @@ AZURE_FORM_RECOGNIZER_KEY=
 # Open Telemetry Config (used within the simulator)
 OTEL_SERVICE_NAME=aoai-api-simulator-local-dev
 OTEL_METRIC_EXPORT_INTERVAL=10000
-# OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317
+# OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317 ## use when the simulator runs inside Docker and the OTLP endpoint runs on the Docker host
 OTEL_EXPORTER_OTLP_ENDPOINT=
 APPLICATIONINSIGHTS_CONNECTION_STRING=
diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/latency.py b/src/aoai-api-simulator/src/aoai_api_simulator/latency.py
index 5b7fa7d..f708646 100644
--- a/src/aoai-api-simulator/src/aoai_api_simulator/latency.py
+++ b/src/aoai-api-simulator/src/aoai_api_simulator/latency.py
@@ -3,7 +3,7 @@
 from fastapi import Response
 
 from aoai_api_simulator import constants
-from aoai_api_simulator.metrics import simulator_metrics
+from aoai_api_simulator.telemetry import simulator_metrics
 from aoai_api_simulator.models import RequestContext
 
 
diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/limiters.py b/src/aoai-api-simulator/src/aoai_api_simulator/limiters.py
index fd06298..66ab10b 100644
--- a/src/aoai-api-simulator/src/aoai_api_simulator/limiters.py
+++ b/src/aoai-api-simulator/src/aoai_api_simulator/limiters.py
@@ -7,7 +7,7 @@
 from typing import Awaitable, Callable
 
 from aoai_api_simulator import constants
-from aoai_api_simulator.metrics import simulator_metrics
+from aoai_api_simulator.telemetry import simulator_metrics
 from aoai_api_simulator.models import Config, RequestContext
 from fastapi import Response
 
diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/metrics.py b/src/aoai-api-simulator/src/aoai_api_simulator/metrics.py
deleted file mode 100644
index a010db2..0000000
--- a/src/aoai-api-simulator/src/aoai_api_simulator/metrics.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from dataclasses import dataclass
-from 
opentelemetry import metrics - - -@dataclass -class SimulatorMetrics: - histogram_latency_base: metrics.Histogram - histogram_latency_full: metrics.Histogram - histogram_tokens_used: metrics.Histogram - histogram_tokens_requested: metrics.Histogram - histogram_tokens_rate_limit: metrics.Histogram - histogram_rate_limit: metrics.Histogram - - -def _get_simulator_metrics() -> SimulatorMetrics: - meter = metrics.get_meter(__name__) - return SimulatorMetrics( - # dimensions: deployment, status_code - histogram_latency_base=meter.create_histogram( - name="aoai-api-simulator.latency.base", - description="Latency of handling the request (before adding simulated latency)", - unit="seconds", - ), - # dimensions: deployment, status_code - histogram_latency_full=meter.create_histogram( - name="aoai-api-simulator.latency.full", - description="Full latency of handling the request (including simulated latency)", - unit="seconds", - ), - # dimensions: deployment, token_type - histogram_tokens_used=meter.create_histogram( - name="aoai-api-simulator.tokens.used", - description="Number of tokens used per request", - unit="tokens", - ), - # dimensions: deployment, token_type - histogram_tokens_requested=meter.create_histogram( - name="aoai-api-simulator.tokens.requested", - description="Number of tokens across all requests (success or not)", - unit="tokens", - ), - # dimensions: deployment - histogram_tokens_rate_limit=meter.create_histogram( - name="aoai-api-simulator.tokens.rate-limit", - description="Number of tokens that were counted for rate-limiting", - unit="tokens", - ), - # dimensions: deployment, reason - histogram_rate_limit=meter.create_histogram( - name="aoai-api-simulator.limits", - description="Number of requests that were rate-limited", - unit="requests", - ), - ) - - -simulator_metrics = _get_simulator_metrics() diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py index 42fdae7..2d1a3cd 
100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py @@ -3,6 +3,7 @@ # from opentelemetry import trace from azure.monitor.opentelemetry import configure_azure_monitor +from dataclasses import dataclass from fastapi import FastAPI from opentelemetry import metrics, trace from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter @@ -28,6 +29,60 @@ application_insights_connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") +@dataclass +class SimulatorMetrics: + histogram_latency_base: metrics.Histogram + histogram_latency_full: metrics.Histogram + histogram_tokens_used: metrics.Histogram + histogram_tokens_requested: metrics.Histogram + histogram_tokens_rate_limit: metrics.Histogram + histogram_rate_limit: metrics.Histogram + + +def _get_simulator_metrics() -> SimulatorMetrics: + meter = metrics.get_meter(__name__) + return SimulatorMetrics( + # dimensions: deployment, status_code + histogram_latency_base=meter.create_histogram( + name="aoai-api-simulator.latency.base", + description="Latency of handling the request (before adding simulated latency)", + unit="seconds", + ), + # dimensions: deployment, status_code + histogram_latency_full=meter.create_histogram( + name="aoai-api-simulator.latency.full", + description="Full latency of handling the request (including simulated latency)", + unit="seconds", + ), + # dimensions: deployment, token_type + histogram_tokens_used=meter.create_histogram( + name="aoai-api-simulator.tokens.used", + description="Number of tokens used per request", + unit="tokens", + ), + # dimensions: deployment, token_type + histogram_tokens_requested=meter.create_histogram( + name="aoai-api-simulator.tokens.requested", + description="Number of tokens across all requests (success or not)", + unit="tokens", + ), + # dimensions: deployment + histogram_tokens_rate_limit=meter.create_histogram( + 
name="aoai-api-simulator.tokens.rate-limit", + description="Number of tokens that were counted for rate-limiting", + unit="tokens", + ), + # dimensions: deployment, reason + histogram_rate_limit=meter.create_histogram( + name="aoai-api-simulator.limits", + description="Number of requests that were rate-limited", + unit="requests", + ), + ) + + +simulator_metrics = _get_simulator_metrics() + def setup_telemetry() -> bool: using_azure_monitor: bool From fb9fe4248229f5ca1ee3941e62076f0227c4c1b0 Mon Sep 17 00:00:00 2001 From: Dasith Wijes Date: Wed, 25 Sep 2024 18:32:23 +1000 Subject: [PATCH 14/15] fixup --- docs/telemetry.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/telemetry.md b/docs/telemetry.md index 471d623..5be1f3c 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -87,4 +87,4 @@ Dimensions: The `make start-telemetry` command starts the `grafana/otel-lgtm` container. This is an [all-in-one container](https://grafana.com/blog/2024/03/13/an-opentelemetry-backend-in-a-docker-image-introducing-grafana/otel-lgtm/) to capture traces, metrics and logs. -It exposes `grafana` UI on port `3000`. \ No newline at end of file +It exposes `grafana` UI on port `3000`. 
From 5f113a2ed12df215da3913120d70650992dec10e Mon Sep 17 00:00:00 2001 From: Stuart Leeks Date: Thu, 17 Oct 2024 12:57:13 +0000 Subject: [PATCH 15/15] fixup low-hanging linter warnings --- .../src/aoai_api_simulator/generator/openai.py | 4 ++-- src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/generator/openai.py b/src/aoai-api-simulator/src/aoai_api_simulator/generator/openai.py index 5cd02bb..790b35f 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/generator/openai.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/generator/openai.py @@ -261,7 +261,7 @@ def create_completion_response( ) -# pylint: disable-next=too-many-arguments +# pylint: disable-next=too-many-positional-arguments, too-many-arguments def create_lorem_chat_completion_response( context: RequestContext, deployment_name: str, @@ -290,7 +290,7 @@ def create_lorem_chat_completion_response( ) -# pylint: disable-next=too-many-arguments +# pylint: disable-next=too-many-positional-arguments, too-many-arguments def create_chat_completion_response( context: RequestContext, deployment_name: str, diff --git a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py index 2d1a3cd..287e9ee 100644 --- a/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py +++ b/src/aoai-api-simulator/src/aoai_api_simulator/telemetry.py @@ -1,9 +1,9 @@ import logging import os +from dataclasses import dataclass # from opentelemetry import trace from azure.monitor.opentelemetry import configure_azure_monitor -from dataclasses import dataclass from fastapi import FastAPI from opentelemetry import metrics, trace from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter @@ -83,6 +83,7 @@ def _get_simulator_metrics() -> SimulatorMetrics: simulator_metrics = _get_simulator_metrics() + def setup_telemetry() 
-> bool: using_azure_monitor: bool