Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
This project adheres to [CHANGELOG](http://keepachangelog.com).

## [4.1.0] - 2025-12-12
### Added
- Add opt-in histogram mode via `PROMETHEUS_USE_HISTOGRAMS` config for accurate percentile calculations (p50, p95, p99)
- Add configurable `PROMETHEUS_LATENCY_BUCKETS` for customizing histogram bucket boundaries
- Histogram metrics use seconds (the Prometheus convention); Summary metrics remain in milliseconds for backwards compatibility

## [4.0.0] - 2025-04-05
### Changed *breaking*
- Renamed `original_image.fetch` metric to `original_image.fetch.count` and
Expand Down
44 changes: 44 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,52 @@ METRICS = 'tc_prometheus.metrics.prometheus_metrics'

# optional with defaults
PROMETHEUS_SCRAPE_PORT = 8000 # Port the prometheus client should listen on

# Histogram mode (opt-in feature for percentile calculations)
PROMETHEUS_USE_HISTOGRAMS = False # Set to True to enable histogram metrics instead of summaries
PROMETHEUS_LATENCY_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0] # Bucket boundaries in seconds
```

### Histogram vs Summary Metrics

By default, `tc_prometheus` uses **Summary** metrics for timing/latency measurements. This maintains backwards compatibility with existing installations.

For more detailed latency analysis including percentile calculations (p50, p95, p99), you can opt in to **Histogram** metrics:

```python
# Enable histogram mode for better percentile calculations
PROMETHEUS_USE_HISTOGRAMS = True

# Customize bucket boundaries to match your latency profile
# Buckets are in seconds (Thumbor's millisecond values are automatically converted)
# Default: [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]
# Example for fast CDN-backed serving (50ms-5s typical):
PROMETHEUS_LATENCY_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0]
# Alternative example for slower processing (100ms-1min typical); set only one of these:
PROMETHEUS_LATENCY_BUCKETS = [0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0]

# Note: When using histograms, metric values are stored in seconds (Prometheus best practice).
# Summary mode (default) continues to use milliseconds for backwards compatibility.
```

**Important:** Switching from Summary to Histogram changes the metric type and will break existing dashboards and alerts. Plan your migration carefully:

1. Histogram metrics expose `_bucket`, `_count`, and `_sum` time series
2. Summary metrics expose only `_count` and `_sum` time series (the Python Prometheus client does not export quantiles)
3. **Unit change**: Histogram values are in seconds, Summary values are in milliseconds
4. You'll need to update PromQL queries, dashboards, and alerts when switching

**When to use Histograms:**
- You need accurate percentile calculations (p95, p99)
- You want to aggregate latencies across multiple instances
- You need more granular latency distribution visibility
- You're willing to update dashboards/alerts for the migration

**When to keep Summaries:**
- You have existing dashboards and alerts that would break
- You don't need detailed percentile analysis
- You want to minimize cardinality in your metrics

## Multiprocess support

In case thumbor is to be started with multiple processes using the `--processes` argument, the prometheus
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "tc_prometheus"
version = "4.0.0"
version = "4.1.0"
license = "MIT"
license-files = ["LICENSE"]
authors = [{name = "Simon Effenberg", email = "savar@schuldeigen.de"}]
Expand Down
14 changes: 14 additions & 0 deletions tc_prometheus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,17 @@
'Port the prometheus client should listen on',
'Metrics'
)

# Opt-in switch: when True, timing metrics are recorded as Histograms
# instead of Summaries.
Config.define(
    'PROMETHEUS_USE_HISTOGRAMS',
    False,
    'Use Histogram instead of Summary for timing metrics (enables percentile calculations)',
    'Metrics',
)

# Upper bounds (in seconds) of the latency histogram buckets; only consulted
# when PROMETHEUS_USE_HISTOGRAMS is True.
Config.define(
    'PROMETHEUS_LATENCY_BUCKETS',
    [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0],
    'Latency histogram bucket boundaries in seconds (only used when PROMETHEUS_USE_HISTOGRAMS is True)',
    'Metrics',
)
33 changes: 26 additions & 7 deletions tc_prometheus/metrics/prometheus_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from os import environ

from prometheus_client import Counter, start_http_server, Summary, multiprocess
from prometheus_client import Counter, Histogram, start_http_server, Summary, multiprocess
from prometheus_client.core import CollectorRegistry
from thumbor.metrics import BaseMetrics

Expand All @@ -17,6 +17,7 @@ class Metrics(BaseMetrics):

def __init__(self, config):
super().__init__(config)
self.config = config

if not hasattr(Metrics, 'http_server_started'):
port = config.PROMETHEUS_SCRAPE_PORT
Expand All @@ -31,6 +32,7 @@ def __init__(self, config):
Metrics.http_server_started = True
Metrics.counters = {}
Metrics.summaries = {}
Metrics.histograms = {}

# hard coded mapping right now
self.mapping = {
Expand Down Expand Up @@ -59,15 +61,32 @@ def incr(self, metricname, value=1):
def timing(self, metricname, value):
    """Record one timing observation for ``metricname``.

    ``value`` is the duration in milliseconds. When
    ``PROMETHEUS_USE_HISTOGRAMS`` is enabled the value is converted to
    seconds and observed on a ``Histogram`` created with
    ``PROMETHEUS_LATENCY_BUCKETS``; otherwise it is observed unchanged
    (milliseconds) on a ``Summary`` for backwards compatibility.

    Metric objects are created lazily on first use and cached on the
    ``Metrics`` class, keyed by the resolved metric name.
    """
    name, labels = self.__data(metricname)

    if self.config.PROMETHEUS_USE_HISTOGRAMS:
        cache = Metrics.histograms
        if name not in cache:
            cache[name] = Histogram(
                name,
                name,
                labels.keys(),
                buckets=self.config.PROMETHEUS_LATENCY_BUCKETS,
            )
        # Histograms are stored in seconds; callers pass milliseconds.
        observed = value / 1000.0
    else:
        cache = Metrics.summaries
        if name not in cache:
            cache[name] = Summary(name, name, labels.keys())
        # Summaries keep the raw millisecond value (default behavior).
        observed = value

    metric = cache[name]

    # Only labelled metrics need a child selected before observing.
    if labels:
        metric = metric.labels(**labels)

    metric.observe(observed)

def __data(self, metricname):
basename = self.__basename(metricname)
Expand Down
36 changes: 34 additions & 2 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ class PrometheusEndpoint(MetricsContext):
def setUp(self):
super().setUp()
self.http_client = HTTPClient()

def tearDown(self):
super().tearDown()
self.http_client.close()

def test_should_present_metrics(self):
self.context.metrics.incr('test.counter')
self.context.metrics.incr('test.counter', 5)
Expand All @@ -81,3 +81,35 @@ def test_should_present_metrics(self):
expect(body).to_include('thumbor_test_timer_count 2')
expect(body).to_include('thumbor_test_timer_sum 500')
expect(body).to_include('thumbor_response_status_total{statuscode="200"} 1')


class PrometheusHistogramMode(TestCase):
    """Checks that timings are recorded as Histograms when histogram mode is on."""

    def setUp(self):
        """Build a thumbor context whose metrics run in histogram mode."""
        self.config = Config()
        self.config.METRICS = 'tc_prometheus.metrics.prometheus_metrics'
        self.config.PROMETHEUS_SCRAPE_PORT = '8003'
        self.config.PROMETHEUS_USE_HISTOGRAMS = True
        self.config.PROMETHEUS_LATENCY_BUCKETS = [0.01, 0.05, 0.1, 0.5, 1.0]

        self.importer = Importer(self.config)
        self.importer.import_modules()

        self.context = Context(None, self.config, self.importer)

    def test_should_use_histogram_when_enabled(self):
        """A timing call creates a Histogram that uses the configured buckets."""
        # timing() takes milliseconds and converts to seconds, so these
        # become 0.025s, 0.075s and 0.150s and fall into different buckets.
        self.context.metrics.timing('histogram.timer', 25)
        self.context.metrics.timing('histogram.timer', 75)
        self.context.metrics.timing('histogram.timer', 150)

        # Verify histogram was created with correct configuration
        metric_name = 'thumbor_histogram_timer'
        histograms = tc_prometheus.metrics.prometheus_metrics.Metrics.histograms

        expect(metric_name in histograms).to_be_true()

        histogram = histograms[metric_name]
        expect(histogram).Not.to_be_null()

        # The configured buckets are expected back with an extra +Inf bound.
        expect(histogram._upper_bounds).to_equal([0.01, 0.05, 0.1, 0.5, 1.0, float('inf')])