diff --git a/internal/observability/tracer.go b/internal/observability/tracer.go index 25bd6a2..9b0fb53 100644 --- a/internal/observability/tracer.go +++ b/internal/observability/tracer.go @@ -2,6 +2,10 @@ package observability import ( "context" + "time" + + "google.golang.org/grpc" + "google.golang.org/grpc/backoff" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -9,10 +13,9 @@ import ( "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/sdk/resource" "go.opentelemetry.io/otel/sdk/trace" - "go.opentelemetry.io/otel/trace/noop" - semconv "go.opentelemetry.io/otel/semconv/v1.26.0" oteltrace "go.opentelemetry.io/otel/trace" + "go.opentelemetry.io/otel/trace/noop" ) type TracerOptions struct { @@ -33,6 +36,27 @@ func SetupTracer(opts TracerOptions) (func() error, error) { otlpClient := otlptracegrpc.NewClient( otlptracegrpc.WithInsecure(), otlptracegrpc.WithEndpoint(opts.OTLPEndpoint), + otlptracegrpc.WithTimeout(90*time.Second), + // Configure gRPC connection-level retry (reconnect backoff) for network-level failures (e.g., "connection refused") + otlptracegrpc.WithDialOption( + grpc.WithConnectParams(grpc.ConnectParams{ + Backoff: backoff.Config{ + // With BaseDelay=3s and Multiplier=2.0 the waits between connection attempts grow as roughly 3, 6, 12, 24, 48 seconds (before jitter, capped at MaxDelay), so attempts land at roughly [3, 9, 21, 45, 93] seconds after the first failure + BaseDelay: 3 * time.Second, + Multiplier: 2.0, + Jitter: 0.2, + MaxDelay: 120 * time.Second, + }, + MinConnectTimeout: 5 * time.Second, + }), + ), + // Configure application-level (per-export) retry for retryable errors (e.g., rate limits, temporary server errors). NOTE(review): MaxElapsedTime equals the 90s WithTimeout set earlier in this call - presumably intentional so retrying stops when the export deadline does; confirm + otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ + Enabled: true, + InitialInterval: 5 * time.Second, + MaxInterval: 15 * time.Second, + MaxElapsedTime: 90 * time.Second, + }), ) exporter, err := otlptrace.New(context.Background(), otlpClient)