From 518bd05509c5a290b96387a867b8f28654f7cb6f Mon Sep 17 00:00:00 2001 From: Martin Wimpress Date: Fri, 6 Feb 2026 14:08:59 +0000 Subject: [PATCH] feat(processor): prefer speech-specific metrics when available - Prefer SpeechProfile measurements over full-file averages when present, since full-file metrics can be diluted by silence in multi-track recordings. - Add preferSpeechMetricSigned to safely choose speech-specific values for metrics that can be zero or negative (e.g. spectral decrease/skew). - Update DS201 highpass/lowpass, de-esser, LA-2A tuning and recording tips to use speech-specific metrics when available. - Update and add tests (recording_tips_test, adaptive_test) to cover speech-profile-driven branches and preferSpeechMetricSigned. Improves adaptive tuning accuracy for voice-centric recordings and makes recording tips more relevant when speech candidates are detected. Signed-off-by: Martin Wimpress --- internal/logging/recording_tips.go | 25 +++++- internal/logging/recording_tips_test.go | 112 +++++++++++++++++++----- internal/processor/adaptive.go | 105 ++++++++++++++++++---- internal/processor/adaptive_test.go | 28 ++++++ 4 files changed, 225 insertions(+), 45 deletions(-) diff --git a/internal/logging/recording_tips.go b/internal/logging/recording_tips.go index 8d0b588..3a6c244 100644 --- a/internal/logging/recording_tips.go +++ b/internal/logging/recording_tips.go @@ -229,8 +229,15 @@ func tipTooFarFromMic(m *processor.AudioMeasurements, _ *processor.FilterChainCo // spectralDecreaseWarm = -0.05. Skewness > 2.5 is tip-specific (stricter // than adaptive.go's spectralSkewnessLFEmphasis = 1.0). func tipProximityEffect(m *processor.AudioMeasurements, _ *processor.FilterChainConfig) *RecordingTip { - veryWarm := m.SpectralDecrease < -0.10 - warmWithSkew := m.SpectralDecrease < -0.05 && m.SpectralSkewness > 2.5 + decrease := m.SpectralDecrease + skewness := m.SpectralSkewness + if m.SpeechProfile != nil { + decrease = m.SpeechProfile.SpectralDecrease + skewness = m.SpeechProfile.SpectralSkewness + } + + veryWarm := decrease < -0.10 + warmWithSkew := decrease < -0.05 && skewness > 2.5 if !veryWarm && !warmWithSkew { return nil @@ -280,8 +287,13 @@ func tipSibilance(m *processor.AudioMeasurements, config *processor.FilterChainC // References: adaptive.go la2aLRAExpressive = 14.0 LU, // Spectral-Metrics-Reference.md crest > 18 dB = extreme dynamics. func tipDynamicRange(m *processor.AudioMeasurements, _ *processor.FilterChainConfig) *RecordingTip { + crest := m.CrestFactor + if m.SpeechProfile != nil && m.SpeechProfile.CrestFactor > 0 { + crest = m.SpeechProfile.CrestFactor + } + veryWide := m.InputLRA > 18.0 - wideWithCrest := m.InputLRA > 14.0 && m.CrestFactor > 18.0 + wideWithCrest := m.InputLRA > 14.0 && crest > 18.0 if !veryWide && !wideWithCrest { return nil @@ -298,7 +310,12 @@ func tipDynamicRange(m *processor.AudioMeasurements, _ *processor.FilterChainCon // Threshold: CrestFactor < 6 dB (brickwalled per Spectral-Metrics-Reference.md). // CrestFactor == 0 is treated as unmeasured and skipped. func tipOverCompressed(m *processor.AudioMeasurements, _ *processor.FilterChainConfig) *RecordingTip { - if m.CrestFactor >= 6.0 || m.CrestFactor == 0 { + crest := m.CrestFactor + if m.SpeechProfile != nil && m.SpeechProfile.CrestFactor > 0 { + crest = m.SpeechProfile.CrestFactor + } + + if crest >= 6.0 || crest == 0 { return nil } return &RecordingTip{ diff --git a/internal/logging/recording_tips_test.go b/internal/logging/recording_tips_test.go index 23169e0..392c7a0 100644 --- a/internal/logging/recording_tips_test.go +++ b/internal/logging/recording_tips_test.go @@ -363,20 +363,43 @@ func TestTipProximityEffect(t *testing.T) { name string spectralDecrease float64 spectralSkewness float64 + speechProfile *processor.SpeechCandidateMetrics wantTip bool }{ - {"very warm spectral decrease", -0.15, 1.0, true}, - {"warm with high skewness", -0.07, 3.0, true}, - {"warm without skewness", -0.07, 1.5, false}, - {"normal spectral decrease", -0.03, 1.0, false}, - {"boundary decrease -0.10 fires", -0.101, 0.0, true}, - {"boundary decrease -0.05 with skew", -0.051, 2.6, true}, + {"very warm spectral decrease", -0.15, 1.0, nil, true}, + {"warm with high skewness", -0.07, 3.0, nil, true}, + {"warm without skewness", -0.07, 1.5, nil, false}, + {"normal spectral decrease", -0.03, 1.0, nil, false}, + {"boundary decrease -0.10 fires", -0.101, 0.0, nil, true}, + {"boundary decrease -0.05 with skew", -0.051, 2.6, nil, true}, + { + name: "speech profile overrides full-file no tip", + spectralDecrease: -0.15, + spectralSkewness: 1.0, + speechProfile: &processor.SpeechCandidateMetrics{SpectralDecrease: -0.03, SpectralSkewness: 0.5}, + wantTip: false, + }, + { + name: "speech profile triggers when full-file would not", + spectralDecrease: -0.03, + spectralSkewness: 0.5, + speechProfile: &processor.SpeechCandidateMetrics{SpectralDecrease: -0.15, SpectralSkewness: 1.0}, + wantTip: true, + }, + { + name: "nil speech profile uses full-file", + spectralDecrease: -0.15, + spectralSkewness: 1.0, + speechProfile: nil, + wantTip: true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { m := &processor.AudioMeasurements{} m.SpectralDecrease = tt.spectralDecrease m.SpectralSkewness = tt.spectralSkewness + m.SpeechProfile = tt.speechProfile tip := tipProximityEffect(m, nil) if (tip != nil) != tt.wantTip { t.Errorf("tipProximityEffect() returned tip=%v, want tip=%v", tip != nil, tt.wantTip) @@ -481,23 +504,46 @@ func TestTipSibilance(t *testing.T) { func TestTipDynamicRange(t *testing.T) { tests := []struct { - name string - inputLRA float64 - crestFactor float64 - wantTip bool + name string + inputLRA float64 + crestFactor float64 + speechProfile *processor.SpeechCandidateMetrics + wantTip bool }{ - {"very wide LRA", 20.0, 12.0, true}, - {"wide LRA with high crest", 15.0, 20.0, true}, - {"wide LRA with normal crest", 15.0, 12.0, false}, - {"normal LRA", 10.0, 12.0, false}, - {"boundary LRA 18 no tip", 18.0, 12.0, false}, - {"boundary LRA 14 with crest 18 no tip", 14.0, 18.0, false}, + {"very wide LRA", 20.0, 12.0, nil, true}, + {"wide LRA with high crest", 15.0, 20.0, nil, true}, + {"wide LRA with normal crest", 15.0, 12.0, nil, false}, + {"normal LRA", 10.0, 12.0, nil, false}, + {"boundary LRA 18 no tip", 18.0, 12.0, nil, false}, + {"boundary LRA 14 with crest 18 no tip", 14.0, 18.0, nil, false}, + { + name: "speech crest overrides full-file no wideWithCrest", + inputLRA: 15.0, + crestFactor: 20.0, + speechProfile: &processor.SpeechCandidateMetrics{CrestFactor: 12.0}, + wantTip: false, + }, + { + name: "speech crest triggers wideWithCrest", + inputLRA: 15.0, + crestFactor: 12.0, + speechProfile: &processor.SpeechCandidateMetrics{CrestFactor: 20.0}, + wantTip: true, + }, + { + name: "veryWide ignores crest speech profile", + inputLRA: 20.0, + crestFactor: 12.0, + speechProfile: &processor.SpeechCandidateMetrics{CrestFactor: 5.0}, + wantTip: true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { m := &processor.AudioMeasurements{} m.InputLRA = tt.inputLRA m.CrestFactor = tt.crestFactor + m.SpeechProfile = tt.speechProfile tip := tipDynamicRange(m, nil) if (tip != nil) != tt.wantTip { t.Errorf("tipDynamicRange() returned tip=%v, want tip=%v", tip != nil, tt.wantTip) @@ -511,19 +557,39 @@ func TestTipDynamicRange(t *testing.T) { func TestTipOverCompressed(t *testing.T) { tests := []struct { - name string - crestFactor float64 - wantTip bool + name string + crestFactor float64 + speechProfile *processor.SpeechCandidateMetrics + wantTip bool }{ - {"heavily compressed", 4.0, true}, - {"boundary crest 6 no tip", 6.0, false}, - {"crest zero unmeasured", 0, false}, - {"normal crest", 12.0, false}, + {"heavily compressed", 4.0, nil, true}, + {"boundary crest 6 no tip", 6.0, nil, false}, + {"crest zero unmeasured", 0, nil, false}, + {"normal crest", 12.0, nil, false}, + { + name: "speech crest overrides full-file no tip", + crestFactor: 4.0, + speechProfile: &processor.SpeechCandidateMetrics{CrestFactor: 12.0}, + wantTip: false, + }, + { + name: "speech crest triggers compressed", + crestFactor: 12.0, + speechProfile: &processor.SpeechCandidateMetrics{CrestFactor: 4.0}, + wantTip: true, + }, + { + name: "speech crest zero uses full-file", + crestFactor: 4.0, + speechProfile: &processor.SpeechCandidateMetrics{CrestFactor: 0}, + wantTip: true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { m := &processor.AudioMeasurements{} m.CrestFactor = tt.crestFactor + m.SpeechProfile = tt.speechProfile tip := tipOverCompressed(m, nil) if (tip != nil) != tt.wantTip { t.Errorf("tipOverCompressed() returned tip=%v, want tip=%v", tip != nil, tt.wantTip) diff --git a/internal/processor/adaptive.go b/internal/processor/adaptive.go index 02cea3d..ecb5da3 100644 --- a/internal/processor/adaptive.go +++ b/internal/processor/adaptive.go @@ -444,14 +444,29 @@ func tuneDS201HighPass(config *FilterChainConfig, measurements *AudioMeasurement return } + // Prefer speech-specific spectral metrics when available. + // Full-file averages are diluted by silence in multi-track recordings. + hasSpeech := measurements.SpeechProfile != nil + centroid := measurements.SpectralCentroid + if hasSpeech { + centroid = preferSpeechMetric(centroid, measurements.SpeechProfile.SpectralCentroid) + } + var speechDecrease, speechSkewness float64 + if hasSpeech { + speechDecrease = measurements.SpeechProfile.SpectralDecrease + speechSkewness = measurements.SpeechProfile.SpectralSkewness + } + decrease := preferSpeechMetricSigned(measurements.SpectralDecrease, speechDecrease, hasSpeech) + skewness := preferSpeechMetricSigned(measurements.SpectralSkewness, speechSkewness, hasSpeech) + // Determine base frequency from spectral centroid var baseFreq float64 switch { - case measurements.SpectralCentroid > centroidBright: + case centroid > centroidBright: // Bright voice with high-frequency energy concentration // Safe to use higher cutoff - voice energy is well above 100Hz baseFreq = ds201HPBrightFreq - case measurements.SpectralCentroid > centroidNormal: + case centroid > centroidNormal: // Normal voice with balanced frequency distribution // Use standard cutoff for podcast speech baseFreq = ds201HPDefaultFreq @@ -504,21 +519,21 @@ func tuneDS201HighPass(config *FilterChainConfig, measurements *AudioMeasurement // - Reduced mix (blend filtered with dry signal) // // This removes subsonic rumble while preserving bass character. - if measurements.SpectralDecrease < spectralDecreaseVeryWarm { + if decrease < spectralDecreaseVeryWarm { // Very warm voice // Use minimal settings: 30Hz cutoff, gentle Q, 50% mix config.DS201HPFreq = ds201HPVeryWarmFreq config.DS201HPWidth = ds201HPVeryWarmWidth config.DS201HPMix = ds201HPVeryWarmMix config.DS201HPPoles = 1 // Gentle 6dB/oct slope - } else if measurements.SpectralSkewness > spectralSkewnessLFEmphasis { + } else if skewness > spectralSkewnessLFEmphasis { // Significant LF emphasis // Use warm settings: 40Hz cutoff, gentle Q, 70% mix config.DS201HPFreq = ds201HPWarmFreq config.DS201HPWidth = ds201HPWarmWidth config.DS201HPMix = ds201HPWarmMix config.DS201HPPoles = 1 // Gentle 6dB/oct slope - } else if measurements.SpectralDecrease < spectralDecreaseWarm { + } else if decrease < spectralDecreaseWarm { // Warm voice - cap at default with gentle slope to preserve body if config.DS201HPFreq > ds201HPDefaultFreq { config.DS201HPFreq = ds201HPDefaultFreq @@ -553,8 +568,14 @@ func tuneDS201LowPass(config *FilterChainConfig, m *AudioMeasurements) { config.DS201LPContentType = contentType // Calculate rolloff/centroid ratio for logging - if m.SpectralCentroid > 0 { - config.DS201LPRolloffRatio = m.SpectralRolloff / m.SpectralCentroid + rolloff := m.SpectralRolloff + centroid := m.SpectralCentroid + if m.SpeechProfile != nil { + rolloff = preferSpeechMetric(rolloff, m.SpeechProfile.SpectralRolloff) + centroid = preferSpeechMetric(centroid, m.SpeechProfile.SpectralCentroid) + } + if centroid > 0 { + config.DS201LPRolloffRatio = rolloff / centroid } switch contentType { @@ -596,17 +617,26 @@ func tuneDS201LowPassForSpeech(config *FilterChainConfig, m *AudioMeasurements) config.DS201LPMix = 1.0 config.DS201LPReason = "no HF issues detected" + // Prefer speech-specific spectral metrics when available. + // Full-file averages are diluted by silence in multi-track recordings. + rolloff := m.SpectralRolloff + centroid := m.SpectralCentroid + if m.SpeechProfile != nil { + rolloff = preferSpeechMetric(rolloff, m.SpeechProfile.SpectralRolloff) + centroid = preferSpeechMetric(centroid, m.SpeechProfile.SpectralCentroid) + } + // Condition 1: Voice already dark (rolloff < 8kHz) // No benefit from lowpass — would only remove wanted content - if m.SpectralRolloff < lpRolloffDarkVoice { + if rolloff < lpRolloffDarkVoice { config.DS201LPReason = "voice already dark (rolloff < 8kHz)" return } // Condition 2: High rolloff (> 14kHz) — ultrasonic content present // Enable at rolloff + 2kHz to clean up ultrasonics while preserving audible content - if m.SpectralRolloff > lpRolloffEnableThreshold { - cutoff := m.SpectralRolloff + lpRolloffHeadroom + if rolloff > lpRolloffEnableThreshold { + cutoff := rolloff + lpRolloffHeadroom // Clamp to reasonable maximum if cutoff > 20000 { cutoff = 20000 @@ -621,7 +651,7 @@ func tuneDS201LowPassForSpeech(config *FilterChainConfig, m *AudioMeasurements) // Condition 3: High ZCR with low centroid (HF noise, not sibilance) // Sibilance has high ZCR AND high centroid; noise has high ZCR with low centroid - if m.ZeroCrossingsRate > lpZCRHigh && m.SpectralCentroid < lpZCRCentroidThreshold { + if m.ZeroCrossingsRate > lpZCRHigh && centroid < lpZCRCentroidThreshold { config.DS201LPEnabled = true config.DS201LPFreq = lpZCRCutoff config.DS201LPPoles = 1 // 6dB/oct - gentle @@ -682,6 +712,18 @@ func preferSpeechMetric(fullFile, speechProfile float64) float64 { return fullFile } +// preferSpeechMetricSigned returns speech-specific measurement if speech data +// exists, otherwise falls back to full-file measurement. Unlike preferSpeechMetric, +// this variant uses an explicit flag rather than checking value > 0, making it +// safe for metrics that can legitimately be zero or negative (e.g. SpectralDecrease, +// SpectralSkewness). +func preferSpeechMetricSigned(fullFile, speechValue float64, hasSpeech bool) float64 { + if hasSpeech { + return speechValue + } + return fullFile +} + // scaleExpansion returns expansion depth based on noise severity. // Noisier recordings need more aggressive expansion to suppress residuals. func scaleExpansion(noiseFloor float64) float64 { @@ -767,10 +809,17 @@ func tuneDeesserFull(config *FilterChainConfig, measurements *AudioMeasurements) // tuneDeesserCentroidOnly provides fallback when rolloff is unavailable func tuneDeesserCentroidOnly(config *FilterChainConfig, measurements *AudioMeasurements) { + // Prefer speech-specific centroid when available. + // Full-file averages are diluted by silence in multi-track recordings. + centroid := measurements.SpectralCentroid + if measurements.SpeechProfile != nil { + centroid = preferSpeechMetric(centroid, measurements.SpeechProfile.SpectralCentroid) + } + switch { - case measurements.SpectralCentroid > centroidVeryBright: + case centroid > centroidVeryBright: config.DeessIntensity = deessIntensityBright - case measurements.SpectralCentroid > centroidBright: + case centroid > centroidBright: config.DeessIntensity = deessIntensityNormal default: config.DeessIntensity = deessIntensityDark @@ -1272,6 +1321,13 @@ func tuneLA2ARelease(config *FilterChainConfig, measurements *AudioMeasurements) flux = preferSpeechMetric(flux, measurements.SpeechProfile.SpectralFlux) } + // Prefer speech-specific skewness for warm voice detection + var speechSkewness float64 + if measurements.SpeechProfile != nil { + speechSkewness = measurements.SpeechProfile.SpectralSkewness + } + skewness := preferSpeechMetricSigned(measurements.SpectralSkewness, speechSkewness, measurements.SpeechProfile != nil) + // Start with standard LA-2A-style release release := la2aReleaseStandard @@ -1299,7 +1355,7 @@ func tuneLA2ARelease(config *FilterChainConfig, measurements *AudioMeasurements) // Warm voices (positive skewness = bass-concentrated) get extra release // This preserves the body and warmth that LA-2A is known for - if measurements.SpectralSkewness > la2aSkewnessWarm { + if skewness > la2aSkewnessWarm { release += la2aReleaseWarmBoost } @@ -1397,20 +1453,33 @@ func tuneLA2AKnee(config *FilterChainConfig, measurements *AudioMeasurements) { // Start with standard LA-2A soft knee knee := la2aKneeNormal + // Prefer speech-specific spectral metrics when available. + // Full-file averages are diluted by silence in multi-track recordings. + hasSpeech := measurements.SpeechProfile != nil + centroid := measurements.SpectralCentroid + if hasSpeech { + centroid = preferSpeechMetric(centroid, measurements.SpeechProfile.SpectralCentroid) + } + var speechSkewness float64 + if hasSpeech { + speechSkewness = measurements.SpeechProfile.SpectralSkewness + } + skewness := preferSpeechMetricSigned(measurements.SpectralSkewness, speechSkewness, hasSpeech) + // Adjust based on spectral centroid (voice brightness) - if measurements.SpectralCentroid > 0 { + if centroid > 0 { switch { - case measurements.SpectralCentroid < la2aCentroidDark: + case centroid < la2aCentroidDark: // Dark/warm voice - extra soft knee preserves warmth knee = la2aKneeDark - case measurements.SpectralCentroid > la2aCentroidBright: + case centroid > la2aCentroidBright: // Bright voice - slightly firmer knee knee = la2aKneeBright } } // Warm/bass-concentrated voices get extra soft knee - if measurements.SpectralSkewness > la2aSkewnessWarm { + if skewness > la2aSkewnessWarm { knee += la2aKneeWarmBoost } diff --git a/internal/processor/adaptive_test.go b/internal/processor/adaptive_test.go index bbc05a5..655fd0a 100644 --- a/internal/processor/adaptive_test.go +++ b/internal/processor/adaptive_test.go @@ -1292,6 +1292,34 @@ func TestPreferSpeechMetric(t *testing.T) { } } +func TestPreferSpeechMetricSigned(t *testing.T) { + tests := []struct { + name string + fullFile float64 + speechValue float64 + hasSpeech bool + want float64 + }{ + {"speech available positive", 1000.0, 1500.0, true, 1500.0}, + {"speech available negative", -0.05, -0.12, true, -0.12}, + {"speech available zero", -0.05, 0.0, true, 0.0}, + {"no speech falls back", 1000.0, 0.0, false, 1000.0}, + {"no speech with negative fallback", -0.05, 0.0, false, -0.05}, + {"both zero with speech", 0.0, 0.0, true, 0.0}, + {"both zero without speech", 0.0, 0.0, false, 0.0}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := preferSpeechMetricSigned(tt.fullFile, tt.speechValue, tt.hasSpeech) + if got != tt.want { + t.Errorf("preferSpeechMetricSigned(%v, %v, %v) = %v, want %v", + tt.fullFile, tt.speechValue, tt.hasSpeech, got, tt.want) + } + }) + } +} + func TestSanitizeFloat(t *testing.T) { // Tests for the sanitizeFloat helper function // Returns default value for NaN and Inf, otherwise returns original value