diff --git a/CHANGELOG.md b/CHANGELOG.md index 7abc04b..ec04792 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). This project adheres to [CHANGELOG](http://keepachangelog.com). +## [4.1.0] - 2025-12-12 +### Added +- Add opt-in histogram mode via `PROMETHEUS_USE_HISTOGRAMS` config for accurate percentile calculations (p50, p95, p99) +- Add configurable `PROMETHEUS_LATENCY_BUCKETS` for customizing histogram bucket boundaries +- Histogram metrics use seconds (Prometheus standard), Summary metrics remain in milliseconds for backwards compatibility + ## [4.0.0] - 2025-04-05 ### Changed *breaking* - Renamed `original_image.fetch` metric to `original_image.fetch.count` and diff --git a/README.md b/README.md index 8444e07..d443d12 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,52 @@ METRICS = 'tc_prometheus.metrics.prometheus_metrics' # optional with defaults PROMETHEUS_SCRAPE_PORT = 8000 # Port the prometheus client should listen on + +# Histogram mode (opt-in feature for percentile calculations) +PROMETHEUS_USE_HISTOGRAMS = False # Set to True to enable histogram metrics instead of summaries +PROMETHEUS_LATENCY_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0] # Bucket boundaries in seconds ``` +### Histogram vs Summary Metrics + +By default, `tc_prometheus` uses **Summary** metrics for timing/latency measurements. This maintains backwards compatibility with existing installations. + +For more detailed latency analysis including percentile calculations (p50, p95, p99), you can opt in to **Histogram** metrics: + +```python +# Enable histogram mode for better percentile calculations +PROMETHEUS_USE_HISTOGRAMS = True + +# Customize bucket boundaries to match your latency profile +# Buckets are in seconds (Thumbor's millisecond values are automatically converted) +# Default: [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0] +# Example for fast CDN-backed serving (50ms-5s typical): +PROMETHEUS_LATENCY_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0] +# Example for slower processing (100ms-1min typical): +PROMETHEUS_LATENCY_BUCKETS = [0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0] + +# Note: When using histograms, metric values are stored in seconds (Prometheus best practice). +# Summary mode (default) continues to use milliseconds for backwards compatibility. +``` + +**Important:** Switching from Summary to Histogram changes the metric type and will break existing dashboards and alerts. Plan your migration carefully: + +1. Histogram metrics expose `_bucket`, `_count`, and `_sum` time series +2. Summary metrics expose quantiles directly (`_sum`, `_count`) +3. **Unit change**: Histogram values are in seconds, Summary values are in milliseconds +4. You'll need to update PromQL queries, dashboards, and alerts when switching + +**When to use Histograms:** +- You need accurate percentile calculations (p95, p99) +- You want to aggregate latencies across multiple instances +- You need more granular latency distribution visibility +- You're willing to update dashboards/alerts for the migration + +**When to keep Summaries:** +- You have existing dashboards and alerts that would break +- You don't need detailed percentile analysis +- You want to minimize cardinality in your metrics + ## Multiprocess support In case thumbor is to be started with multiple processes using the `--processes` argument, the prometheus diff --git a/pyproject.toml b/pyproject.toml index f8bea27..2fee0c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "tc_prometheus" -version = "4.0.0" +version = "4.1.0" license = "MIT" license-files = ["LICENSE"] authors = [{name = "Simon Effenberg", email = "savar@schuldeigen.de"}] diff --git a/tc_prometheus/__init__.py b/tc_prometheus/__init__.py index 7cd4117..b626b93 100644 --- a/tc_prometheus/__init__.py +++ b/tc_prometheus/__init__.py @@ -7,3 +7,17 @@ 'Port the prometheus client should listen on', 'Metrics' ) + +Config.define( + 'PROMETHEUS_USE_HISTOGRAMS', + False, + 'Use Histogram instead of Summary for timing metrics (enables percentile calculations)', + 'Metrics' +) + +Config.define( + 'PROMETHEUS_LATENCY_BUCKETS', + [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0], + 'Latency histogram bucket boundaries in seconds (only used when PROMETHEUS_USE_HISTOGRAMS is True)', + 'Metrics' +) diff --git a/tc_prometheus/metrics/prometheus_metrics.py b/tc_prometheus/metrics/prometheus_metrics.py index 93ec560..6987948 100644 --- a/tc_prometheus/metrics/prometheus_metrics.py +++ b/tc_prometheus/metrics/prometheus_metrics.py @@ -8,7 +8,7 @@ from os import environ -from prometheus_client import Counter, start_http_server, Summary, multiprocess +from prometheus_client import Counter, Histogram, start_http_server, Summary, multiprocess from prometheus_client.core import CollectorRegistry from thumbor.metrics import BaseMetrics @@ -17,6 +17,7 @@ class Metrics(BaseMetrics): def __init__(self, config): super().__init__(config) + self.config = config if not hasattr(Metrics, 'http_server_started'): port = config.PROMETHEUS_SCRAPE_PORT @@ -31,6 +32,7 @@ def __init__(self, config): Metrics.http_server_started = True Metrics.counters = {} Metrics.summaries = {} + Metrics.histograms = {} # hard coded mapping right now self.mapping = { @@ -59,15 +61,32 @@ def incr(self, metricname, value=1): def timing(self, metricname, value): name, labels = self.__data(metricname) - if name not in Metrics.summaries: - Metrics.summaries[name] = Summary(name, name, labels.keys()) + if self.config.PROMETHEUS_USE_HISTOGRAMS: + # Use Histogram for better percentile calculations + # Convert milliseconds to seconds for Prometheus best practices + value_in_seconds = value / 1000.0 - summary = Metrics.summaries[name] + if name not in Metrics.histograms: + buckets = self.config.PROMETHEUS_LATENCY_BUCKETS + Metrics.histograms[name] = Histogram(name, name, labels.keys(), buckets=buckets) - if len(labels) != 0: - summary = summary.labels(**labels) + metric = Metrics.histograms[name] + + if len(labels) != 0: + metric = metric.labels(**labels) + + metric.observe(value_in_seconds) + else: + # Use Summary with millisecond values for backwards compatibility (default behavior) + if name not in Metrics.summaries: + Metrics.summaries[name] = Summary(name, name, labels.keys()) + + metric = Metrics.summaries[name] + + if len(labels) != 0: + metric = metric.labels(**labels) - summary.observe(value) + metric.observe(value) def __data(self, metricname): basename = self.__basename(metricname) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 387fa57..835623a 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -55,11 +55,11 @@ class PrometheusEndpoint(MetricsContext): def setUp(self): super().setUp() self.http_client = HTTPClient() - + def tearDown(self): super().tearDown() self.http_client.close() - + def test_should_present_metrics(self): self.context.metrics.incr('test.counter') self.context.metrics.incr('test.counter', 5) @@ -81,3 +81,35 @@ def test_should_present_metrics(self): expect(body).to_include('thumbor_test_timer_count 2') expect(body).to_include('thumbor_test_timer_sum 500') expect(body).to_include('thumbor_response_status_total{statuscode="200"} 1') + + +class PrometheusHistogramMode(TestCase): + def setUp(self): + self.config = Config() + self.config.METRICS = 'tc_prometheus.metrics.prometheus_metrics' + self.config.PROMETHEUS_SCRAPE_PORT = '8003' + self.config.PROMETHEUS_USE_HISTOGRAMS = True + self.config.PROMETHEUS_LATENCY_BUCKETS = [0.01, 0.05, 0.1, 0.5, 1.0] + + self.importer = Importer(self.config) + self.importer.import_modules() + + self.context = Context(None, self.config, self.importer) + + def test_should_use_histogram_when_enabled(self): + self.context.metrics.timing('histogram.timer', 0.025) + self.context.metrics.timing('histogram.timer', 0.075) + self.context.metrics.timing('histogram.timer', 0.150) + + # Verify histogram was created with correct configuration + metric_name = 'thumbor_histogram_timer' + histograms = tc_prometheus.metrics.prometheus_metrics.Metrics.histograms + + expect(metric_name in histograms).to_be_true() + + histogram = histograms[metric_name] + expect(histogram).Not.to_be_null() + + # Verify the histogram is configured with custom buckets + # The histogram should have the buckets we configured + expect(histogram._upper_bounds).to_equal([0.01, 0.05, 0.1, 0.5, 1.0, float('inf')])