From bc2c9596eedcafea16b33e85a630e9e5a98230db Mon Sep 17 00:00:00 2001 From: SamYuan1990 Date: Mon, 19 Jan 2026 22:40:46 +0800 Subject: [PATCH] init with otel Signed-off-by: SamYuan1990 --- prometheus.yml | 19 ++++++ scl/cap_reg.py | 5 ++ scl/llm_chat.py | 6 ++ scl/otel/metric_decorator.py | 26 +++++++++ scl/otel/otel.py | 110 +++++++++++++++++++++++++++++++++++ scl/trace.py | 51 ---------------- 6 files changed, 166 insertions(+), 51 deletions(-) create mode 100644 prometheus.yml create mode 100644 scl/otel/metric_decorator.py create mode 100644 scl/otel/otel.py delete mode 100644 scl/trace.py diff --git a/prometheus.yml b/prometheus.yml new file mode 100644 index 0000000..9754a99 --- /dev/null +++ b/prometheus.yml @@ -0,0 +1,19 @@ +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'codelab-monitor' + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['localhost:9090'] \ No newline at end of file diff --git a/scl/cap_reg.py b/scl/cap_reg.py index ed2a17b..301c842 100644 --- a/scl/cap_reg.py +++ b/scl/cap_reg.py @@ -3,6 +3,8 @@ import logging from typing import List, Dict from scl.meta.capability import Capability +from scl.otel.metric_decorator import record_latency +from scl.otel.otel import search_time_histogram, tool_execute_time_histogram, cap_search_result_count # Add the StructuredContextLanguage directory to the path scl_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -46,10 +48,12 @@ def get_cap_by_name(self, name)-> Capability: ## RAG search between context and function description after embedding ## Return function in openAI tool format @tracer.start_as_current_span("getCapsBySimilarity") + @record_latency(search_time_histogram, cap_search_result_count) def getCapsBySimilarity(self, msg: Msg, limit=5, min_similarity=0.5) -> Dict[str, Capability]: return self.cap_store.search_by_similarity(msg, limit, min_similarity) @tracer.start_as_current_span("invoke_cap_safe") + @record_latency(tool_execute_time_histogram) def call_cap_safe(self, cap: Capability, args_dict=None): ## todo replace by https://github.com/langchain-ai/langchain-sandbox? ## todo replace by e2b? @@ -71,5 +75,6 @@ def record(self, msg: Msg, cap: Capability): return self.cap_store.record(msg, cap) @tracer.start_as_current_span("getCapsByHistory") + @record_latency(search_time_histogram, cap_search_result_count) def getCapsByHistory(self, msg: Msg, limit=5, min_similarity=0.5) -> Dict[str, Capability]: return self.cap_store.getCapsByHistory(msg, limit, min_similarity) diff --git a/scl/llm_chat.py b/scl/llm_chat.py index 6a0cb45..cd1d4f1 100644 --- a/scl/llm_chat.py +++ b/scl/llm_chat.py @@ -4,6 +4,7 @@ from scl.cap_reg import CapRegistry from scl.meta.msg import Msg from scl.config import config +from scl.otel.otel import cap_use_hit_count, cap_duplicate_count, cap_total_count ## why not we just prvide the metrics and leave the function to user themself? 
 ## using hooks to provide user capbility to overwrite the default behavior
@@ -46,6 +47,8 @@ def send_messages(
         **({} if tools_autonomy is None else tools_autonomy),
         **({} if tools_history is None else tools_history)
     }
+    cap_total_count = len(tools_merged)
+    cap_duplicate_count = len(tools_named) + len(tools_autonomy) + len(tools_history) - cap_total_count
     ## metrics tool number,metrics as duplicate number?
     ## or a cache for duplicate info
     tools = []
@@ -88,6 +91,7 @@ def function_call_playground(
             ## todo-> debug/trace
             logging.info(response)
             if response.tool_calls:
+                cap_use_hit_count = len(response.tool_calls)
                 for tool_call in response.tool_calls:
                     ## metric accuery for each search? from LLM, back to cap_reg's cache
                     func1_name = tool_call.function.name
@@ -104,4 +108,6 @@ def function_call_playground(
                     msg.append_cap_result(func1_out, tool_call.id)
                     ## metric execution time
                 response = send_messages(client, model, cap_registry, ToolNames, msg, turns)
+            else:
+                cap_use_hit_count = 0
     return response.content
diff --git a/scl/otel/metric_decorator.py b/scl/otel/metric_decorator.py
new file mode 100644
index 0000000..0211c35
--- /dev/null
+++ b/scl/otel/metric_decorator.py
@@ -0,0 +1,26 @@
+import functools
+import time
+from typing import Callable, Any
+
+def record_latency(histogram, count):
+    """Decorator that records a function's execution time in the given histogram."""
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs) -> Any:
+            # The histogram instrument is passed in explicitly and captured by this closure
+            start_time = time.perf_counter()
+
+            try:
+                result = func(*args, **kwargs)
+                #print(f"Function {func.__name__} returned {result}")
+                # todo: rebinding the local name `count` does not update the module-level value
+                if count is not None:
+                    count = len(result)
+                return result
+            finally:
+                end_time = time.perf_counter()
+                duration = end_time - start_time
+                histogram.record(duration, {"function": func.__name__})
+
+        return wrapper
+    return decorator
\ No newline at end of file
diff --git a/scl/otel/otel.py b/scl/otel/otel.py
new file mode 100644
index 0000000..393431c
--- /dev/null
+++ b/scl/otel/otel.py
@@ -0,0 +1,110 @@
+import os
+import logging
+from typing import Iterable
+
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import (
+    BatchSpanProcessor,
+    ConsoleSpanExporter,
+)
+
+from opentelemetry import metrics
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.metrics import (
+    CallbackOptions,
+    Observation,
+)
+from opentelemetry.sdk.metrics.export import (
+    ConsoleMetricExporter,
+    PeriodicExportingMetricReader,
+)
+from scl.config import config
+
+# Try to import OTLP exporters, fallback to console if not available
+try:
+    from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
+    OTLP_AVAILABLE = True
+except ImportError:
+    OTLP_AVAILABLE = False
+    logging.info("OTLP exporter not available, falling back to console exporter")
+
+# Create the TracerProvider
+provider = TracerProvider()
+
+# Use OTLP exporter if available, otherwise use console exporter
+if OTLP_AVAILABLE:
+    OTLP_ENDPOINT = config.otlp_endpoint
+    span_exporter = OTLPSpanExporter(
+        endpoint=f"{OTLP_ENDPOINT}/v1/traces"
+    )
+else:
+    span_exporter = ConsoleSpanExporter()
+
+# Create and register the span processor
+span_processor = BatchSpanProcessor(span_exporter)
+provider.add_span_processor(span_processor)
+
+# Set the global tracer provider
+trace.set_tracer_provider(provider)
+
+# Create the tracer instance - using the module or application name is recommended
+tracer = trace.get_tracer("scl")
+
+# Use OTLP exporter for metrics if available, otherwise use console exporter
+if OTLP_AVAILABLE:
+    metric_exporter = OTLPMetricExporter(endpoint=f"{OTLP_ENDPOINT}/v1/metrics")
+else:
+    metric_exporter = ConsoleMetricExporter()
+
+metric_reader = PeriodicExportingMetricReader(metric_exporter)
+meter_provider = MeterProvider(metric_readers=[metric_reader])
+
+# Sets the global default meter provider
+metrics.set_meter_provider(meter_provider)
+
+# Creates a meter from the global meter provider
+meter = metrics.get_meter("scl")
+
+## a key value map
+## map
+### key as str in name,desc format
+### value as object
+### value can be a number for gauge
+### value can be a histogram
+
+# Define metrics
+search_time_histogram = meter.create_histogram(
+    name="cap.search.time",
+    description="Time taken for search operations",
+    explicit_bucket_boundaries_advisory=[1.0, 5.0, 10.0],
+    unit="s"
+)
+
+tool_execute_time_histogram = meter.create_histogram(
+    name="cap.execute.time",
+    description="Time taken for cap execution",
+    explicit_bucket_boundaries_advisory=[1.0, 5.0, 10.0],
+    unit="s"
+)
+
+## todo make it a map for search
+cap_search_result_count = 0
+cap_total_count = 0
+cap_duplicate_count = 0
+cap_use_hit_count = 0
+
+def observable_cap_gauge_func(options: CallbackOptions) -> Iterable[Observation]:
+    ## for loop map in ... value as int
+    yield Observation(cap_search_result_count, {"type": "cap search count"})
+    #yield Observation(cap_total_count, {"type": "cap total count"})
+    #yield Observation(cap_duplicate_count, {"type": "cap duplicate count"})
+    #yield Observation(cap_use_hit_count, {"type": "cap use hit count"})
+
+cap_gauge = meter.create_observable_gauge(
+    name="cap.gauge",
+    callbacks=[observable_cap_gauge_func],
+    description="gauge related to cap",
+    unit="1"
+)
diff --git a/scl/trace.py b/scl/trace.py
deleted file mode 100644
index cdfa7dc..0000000
--- a/scl/trace.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import os
-import logging
-
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import (
-    BatchSpanProcessor,
-    ConsoleSpanExporter,
-)
-from scl.config import config
-
-# Try to import OTLP exporter, fallback to console if not available
-try:
-    from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-    OTLP_AVAILABLE = True
-except ImportError:
-    OTLP_AVAILABLE = False
-    logging.info("OTLP exporter not available, falling back to console exporter")
-
-# 创建TracerProvider
-provider = TracerProvider()
-
-# Use OTLP exporter if available, otherwise use console exporter
-if OTLP_AVAILABLE:
-    OTLP_ENDPOINT = config.otlp_endpoint
-    span_exporter = OTLPSpanExporter(
-        endpoint=f"{OTLP_ENDPOINT}/v1/traces"
-    )
-else:
-    span_exporter = ConsoleSpanExporter()
-
-# 创建并添加span processor
-span_processor = BatchSpanProcessor(span_exporter)
-provider.add_span_processor(span_processor)
-
-# 设置全局tracer provider
-trace.set_tracer_provider(provider)
-
-# 创建tracer实例 - 建议使用模块名或应用名
-tracer = trace.get_tracer("scl")
-
-# 测试函数，验证tracer是否工作
-#def test_tracing():
-#    """测试trace是否正常工作"""
-#    with tracer.start_as_current_span("test_operation") as span:
-#        span.set_attribute("test.key", "test.value")
-#        print("Tracing is working! Check your exporter for traces.")
-#        return "Trace created successfully"
-
-# 如果需要立即测试，可以取消注释下面的行
-# test_tracing()
\ No newline at end of file
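
Note from review (not part of the diff above): the observable gauge wiring is still incomplete, as the `## todo make it a map for search` comment and the commented-out Observations suggest. `from scl.otel.otel import cap_use_hit_count, cap_duplicate_count, cap_total_count` copies the current integer values, so assignments such as `cap_total_count = len(tools_merged)` in llm_chat.py, and `count = len(result)` inside record_latency, only rebind local names; the callback in otel.py keeps reporting 0. Below is a minimal sketch of one way to close that gap by keeping the values in a mutable map inside scl/otel/otel.py; the names cap_stats and set_cap_stat are hypothetical, not part of this patch:

    # scl/otel/otel.py (sketch; assumes the CallbackOptions/Observation imports above)
    cap_stats = {
        "cap search count": 0,
        "cap total count": 0,
        "cap duplicate count": 0,
        "cap use hit count": 0,
    }

    def set_cap_stat(key: str, value: int) -> None:
        # Callers update the shared map instead of rebinding an imported integer.
        cap_stats[key] = value

    def observable_cap_gauge_func(options: CallbackOptions) -> Iterable[Observation]:
        # Emit one Observation per entry of the map.
        for key, value in cap_stats.items():
            yield Observation(value, {"type": key})

Callers would then write, for example, `set_cap_stat("cap total count", len(tools_merged))` in send_messages, and record_latency could call `set_cap_stat` with `len(result)` instead of assigning to its `count` argument.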