thumbor-community · jcord04 · Dec 12, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,12 @@ All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 This project adheres to [CHANGELOG](http://keepachangelog.com).
 
+## [4.1.0] - 2025-12-12
+### Added
+- Add opt-in histogram mode via `PROMETHEUS_USE_HISTOGRAMS` config for accurate percentile calculations (p50, p95, p99)
+- Add configurable `PROMETHEUS_LATENCY_BUCKETS` for customizing histogram bucket boundaries
+- Histogram metrics use seconds (Prometheus standard), Summary metrics remain in milliseconds for backwards compatibility
+
 ## [4.0.0] - 2025-04-05
 ### Changed *breaking*
 - Renamed `original_image.fetch` metric to `original_image.fetch.count` and

diff --git a/README.md b/README.md
@@ -21,8 +21,52 @@ METRICS = 'tc_prometheus.metrics.prometheus_metrics'
 
 # optional with defaults
 PROMETHEUS_SCRAPE_PORT = 8000 # Port the prometheus client should listen on
+
+# Histogram mode (opt-in feature for percentile calculations)
+PROMETHEUS_USE_HISTOGRAMS = False # Set to True to enable histogram metrics instead of summaries
+PROMETHEUS_LATENCY_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0] # Bucket boundaries in seconds
 ```
 
+### Histogram vs Summary Metrics
+
+By default, `tc_prometheus` uses **Summary** metrics for timing/latency measurements. This maintains backwards compatibility with existing installations.
+
+For more detailed latency analysis including percentile calculations (p50, p95, p99), you can opt in to **Histogram** metrics:
+
+```python
+# Enable histogram mode for better percentile calculations
+PROMETHEUS_USE_HISTOGRAMS = True
+
+# Customize bucket boundaries to match your latency profile
+# Buckets are in seconds (Thumbor's millisecond values are automatically converted)
+# Default: [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]
+# Example for fast CDN-backed serving (50ms-5s typical):
+PROMETHEUS_LATENCY_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0]
+# Example for slower processing (100ms-1min typical):
+PROMETHEUS_LATENCY_BUCKETS = [0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0]
+
+# Note: When using histograms, metric values are stored in seconds (Prometheus best practice).
+# Summary mode (default) continues to use milliseconds for backwards compatibility.
+```
+
+**Important:** Switching from Summary to Histogram changes the metric type and will break existing dashboards and alerts. Plan your migration carefully:
+
+1. Histogram metrics expose `_bucket`, `_count`, and `_sum` time series
+2. Summary metrics expose quantiles directly (`_sum`, `_count`)
+3. **Unit change**: Histogram values are in seconds, Summary values are in milliseconds
+4. You'll need to update PromQL queries, dashboards, and alerts when switching
+
+**When to use Histograms:**
+- You need accurate percentile calculations (p95, p99)
+- You want to aggregate latencies across multiple instances
+- You need more granular latency distribution visibility
+- You're willing to update dashboards/alerts for the migration
+
+**When to keep Summaries:**
+- You have existing dashboards and alerts that would break
+- You don't need detailed percentile analysis
+- You want to minimize cardinality in your metrics
+
 ## Multiprocess support
 
 In case thumbor is to be started with multiple processes using the `--processes` argument, the prometheus

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "tc_prometheus"
-version = "4.0.0"
+version = "4.1.0"
 license = "MIT"
 license-files = ["LICENSE"]
 authors = [{name = "Simon Effenberg", email = "savar@schuldeigen.de"}]

diff --git a/tc_prometheus/__init__.py b/tc_prometheus/__init__.py
@@ -7,3 +7,17 @@
         'Port the prometheus client should listen on',
         'Metrics'
 )
+
+Config.define(
+        'PROMETHEUS_USE_HISTOGRAMS',
+        False,
+        'Use Histogram instead of Summary for timing metrics (enables percentile calculations)',
+        'Metrics'
+)
+
+Config.define(
+        'PROMETHEUS_LATENCY_BUCKETS',
+        [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0],
+        'Latency histogram bucket boundaries in seconds (only used when PROMETHEUS_USE_HISTOGRAMS is True)',
+        'Metrics'
+)
diff --git a/tc_prometheus/metrics/prometheus_metrics.py b/tc_prometheus/metrics/prometheus_metrics.py
@@ -8,7 +8,7 @@
 
 from os import environ
 
-from prometheus_client import Counter, start_http_server, Summary, multiprocess
+from prometheus_client import Counter, Histogram, start_http_server, Summary, multiprocess
 from prometheus_client.core import CollectorRegistry
 from thumbor.metrics import BaseMetrics
 
@@ -17,6 +17,7 @@ class Metrics(BaseMetrics):
 
     def __init__(self, config):
         super().__init__(config)
+        self.config = config
 
         if not hasattr(Metrics, 'http_server_started'):
             port = config.PROMETHEUS_SCRAPE_PORT
@@ -31,6 +32,7 @@ def __init__(self, config):
             Metrics.http_server_started = True
             Metrics.counters = {}
             Metrics.summaries = {}
+            Metrics.histograms = {}
 
         # hard coded mapping right now
         self.mapping = {
@@ -59,15 +61,32 @@ def incr(self, metricname, value=1):
     def timing(self, metricname, value):
         name, labels = self.__data(metricname)
 
-        if name not in Metrics.summaries:
-            Metrics.summaries[name] = Summary(name, name, labels.keys())
+        if self.config.PROMETHEUS_USE_HISTOGRAMS:
+            # Use Histogram for better percentile calculations
+            # Convert milliseconds to seconds for Prometheus best practices
+            value_in_seconds = value / 1000.0
 
-        summary = Metrics.summaries[name]
+            if name not in Metrics.histograms:
+                buckets = self.config.PROMETHEUS_LATENCY_BUCKETS
+                Metrics.histograms[name] = Histogram(name, name, labels.keys(), buckets=buckets)
 
-        if len(labels) != 0:
-            summary = summary.labels(**labels)
+            metric = Metrics.histograms[name]
+
+            if len(labels) != 0:
+                metric = metric.labels(**labels)
+
+            metric.observe(value_in_seconds)
+        else:
+            # Use Summary with millisecond values for backwards compatibility (default behavior)
+            if name not in Metrics.summaries:
+                Metrics.summaries[name] = Summary(name, name, labels.keys())
+
+            metric = Metrics.summaries[name]
+
+            if len(labels) != 0:
+                metric = metric.labels(**labels)
 
-        summary.observe(value)
+            metric.observe(value)
 
     def __data(self, metricname):
         basename = self.__basename(metricname)

diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -55,11 +55,11 @@ class PrometheusEndpoint(MetricsContext):
     def setUp(self):
         super().setUp()
         self.http_client = HTTPClient()
-        
+
     def tearDown(self):
         super().tearDown()
         self.http_client.close()
-    
+
     def test_should_present_metrics(self):
         self.context.metrics.incr('test.counter')
         self.context.metrics.incr('test.counter', 5)
@@ -81,3 +81,35 @@ def test_should_present_metrics(self):
         expect(body).to_include('thumbor_test_timer_count 2')
         expect(body).to_include('thumbor_test_timer_sum 500')
         expect(body).to_include('thumbor_response_status_total{statuscode="200"} 1')
+
+
+class PrometheusHistogramMode(TestCase):
+    def setUp(self):
+        self.config = Config()
+        self.config.METRICS = 'tc_prometheus.metrics.prometheus_metrics'
+        self.config.PROMETHEUS_SCRAPE_PORT = '8003'
+        self.config.PROMETHEUS_USE_HISTOGRAMS = True
+        self.config.PROMETHEUS_LATENCY_BUCKETS = [0.01, 0.05, 0.1, 0.5, 1.0]
+
+        self.importer = Importer(self.config)
+        self.importer.import_modules()
+
+        self.context = Context(None, self.config, self.importer)
+
+    def test_should_use_histogram_when_enabled(self):
+        self.context.metrics.timing('histogram.timer', 0.025)
+        self.context.metrics.timing('histogram.timer', 0.075)
+        self.context.metrics.timing('histogram.timer', 0.150)
+
+        # Verify histogram was created with correct configuration
+        metric_name = 'thumbor_histogram_timer'
+        histograms = tc_prometheus.metrics.prometheus_metrics.Metrics.histograms
+
+        expect(metric_name in histograms).to_be_true()
+
+        histogram = histograms[metric_name]
+        expect(histogram).Not.to_be_null()
+
+        # Verify the histogram is configured with custom buckets
+        # The histogram should have the buckets we configured
+        expect(histogram._upper_bounds).to_equal([0.01, 0.05, 0.1, 0.5, 1.0, float('inf')])