12 changes: 12 additions & 0 deletions internal/audio/reader.go
@@ -4,6 +4,7 @@ package audio
import (
"errors"
"fmt"
"math"

ffmpeg "github.com/linuxmatters/ffmpeg-statigo"
)
@@ -183,6 +184,17 @@ func (r *Reader) GetDecoderContext() *ffmpeg.AVCodecContext {
return r.decCtx
}

// Seek seeks to the specified timestamp in AV_TIME_BASE units.
// Use 0 to seek to the beginning of the file. After seeking, the decoder
// buffers are flushed so that subsequent ReadFrame calls return fresh data.
func (r *Reader) Seek(timestamp int64) error {
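// Note on the arguments below: stream_index -1 selects the default stream, so the
// timestamps are in AV_TIME_BASE units, and the MinInt64/MaxInt64 bounds let
// avformat_seek_file land on the nearest seekable point around the target.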
if _, err := ffmpeg.AVFormatSeekFile(r.fmtCtx, -1, math.MinInt64, timestamp, math.MaxInt64, 0); err != nil {
return fmt.Errorf("failed to seek: %w", err)
}
ffmpeg.AVCodecFlushBuffers(r.decCtx)
return nil
}

// Close releases all resources
func (r *Reader) Close() {
if r.frame != nil {
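Below is a minimal usage sketch of the new Seek method. It is illustrative only: the twoPass wrapper and the idea of two analysis passes are assumptions, the import path for the internal audio package is not shown in this diff, and ReadFrame appears only in comments because its exact signature is outside the changed lines.

// Sketch: rewind a Reader between two analysis passes.
// Assumes: import "fmt" plus the repository's internal audio package.
func twoPass(path string) error {
	reader, _, err := audio.OpenAudioFile(path)
	if err != nil {
		return fmt.Errorf("open: %w", err)
	}
	defer reader.Close()

	// ... first pass: call ReadFrame until EOF ...

	// Rewind to the start; Seek also flushes the decoder buffers,
	// so the second pass reads fresh frames from t=0.
	if err := reader.Seek(0); err != nil {
		return fmt.Errorf("rewind: %w", err)
	}

	// ... second pass ...
	return nil
}
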
217 changes: 154 additions & 63 deletions internal/processor/analyzer.go
@@ -792,15 +792,12 @@ func getIntervalsInRange(intervals []IntervalSample, start, end time.Duration) [
return nil
}

// Find first interval at or after start time
startIdx := -1
for i, interval := range intervals {
if interval.Timestamp >= start {
startIdx = i
break
}
}
if startIdx < 0 {
// Find first interval at or after start time using binary search
// (intervals are sorted by timestamp from the collection loop in AnalyzeAudio)
startIdx := sort.Search(len(intervals), func(i int) bool {
return intervals[i].Timestamp >= start
})
if startIdx >= len(intervals) {
return nil
}
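
For context on the new guard: sort.Search returns the smallest index in [0, n) for which the predicate reports true, or n when no element matches, which is why the code above checks startIdx >= len(intervals). A tiny self-contained illustration (the timestamps are made up):

package main

import (
	"fmt"
	"sort"
	"time"
)

func main() {
	timestamps := []time.Duration{0, 2 * time.Second, 4 * time.Second, 6 * time.Second}
	start := 3 * time.Second
	idx := sort.Search(len(timestamps), func(i int) bool {
		return timestamps[i] >= start
	})
	fmt.Println(idx) // 2: the first timestamp at or after 3s
}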

@@ -1153,6 +1150,36 @@ func roomToneScore(interval IntervalSample, rmsP50, fluxP50 float64) float64 {
return roomToneAmplitudeWeight*amplitudeScore + roomToneFluxWeight*fluxScore
}

// silenceMedians holds pre-computed median values for silence/room-tone detection.
// Avoids redundant O(n log n) sorts when the same interval data is used by
// multiple detection functions.
type silenceMedians struct {
rmsP50 float64
fluxP50 float64
}

// computeSilenceMedians calculates RMS and spectral flux medians from the
// search interval slice used for silence/room-tone detection. The caller
// passes the already-sliced searchIntervals (first silenceSearchPercent% of intervals).
func computeSilenceMedians(searchIntervals []IntervalSample) silenceMedians {
if len(searchIntervals) == 0 {
return silenceMedians{}
}
rmsLevels := make([]float64, len(searchIntervals))
fluxValues := make([]float64, len(searchIntervals))
for i, interval := range searchIntervals {
rmsLevels[i] = interval.RMSLevel
fluxValues[i] = interval.SpectralFlux
}
sort.Float64s(rmsLevels)
sort.Float64s(fluxValues)

return silenceMedians{
rmsP50: rmsLevels[len(rmsLevels)/2],
fluxP50: fluxValues[len(fluxValues)/2],
}
}
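
One behavioural note carried over from the code this helper replaces: indexing the sorted slice at len/2 picks the upper of the two middle values when the length is even, rather than averaging them. A quick illustration:

package main

import (
	"fmt"
	"sort"
)

func main() {
	rms := []float64{0.4, 0.1, 0.3, 0.2}
	sort.Float64s(rms)           // [0.1 0.2 0.3 0.4]
	fmt.Println(rms[len(rms)/2]) // 0.3 (upper middle), not the 0.25 midpoint average
}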

// estimateNoiseFloorAndThreshold analyses interval data to estimate noise floor and silence threshold.
// Returns (noiseFloor, silenceThreshold, ok). If ok is false, fallback values should be used.
//
@@ -1163,7 +1190,7 @@ func roomToneScore(interval IntervalSample, rmsP50, fluxP50 float64) float64 {
//
// The noise floor is the max RMS of high-confidence room tone intervals.
// The silence threshold adds headroom to the noise floor for detection margin.
func estimateNoiseFloorAndThreshold(intervals []IntervalSample) (noiseFloor, silenceThreshold float64, ok bool) {
func estimateNoiseFloorAndThreshold(intervals []IntervalSample, medians silenceMedians) (noiseFloor, silenceThreshold float64, ok bool) {
if len(intervals) < silenceThresholdMinIntervals {
return 0, 0, false
}
@@ -1175,18 +1202,9 @@ func estimateNoiseFloorAndThreshold(intervals []IntervalSample) (noiseFloor, sil
}
searchIntervals := intervals[:searchLimit]

// Calculate medians for scoring reference
rmsLevels := make([]float64, len(searchIntervals))
fluxValues := make([]float64, len(searchIntervals))
for i, interval := range searchIntervals {
rmsLevels[i] = interval.RMSLevel
fluxValues[i] = interval.SpectralFlux
}
sort.Float64s(rmsLevels)
sort.Float64s(fluxValues)

rmsP50 := rmsLevels[len(rmsLevels)/2]
fluxP50 := fluxValues[len(fluxValues)/2]
// Use pre-computed medians for scoring reference
rmsP50 := medians.rmsP50
fluxP50 := medians.fluxP50

// Score each interval for room tone likelihood
type scoredInterval struct {
@@ -1233,15 +1251,15 @@ func estimateNoiseFloorAndThreshold(intervals []IntervalSample) (noiseFloor, sil
// Uses a room tone score approach that considers both amplitude and spectral stability.
//
// Detection algorithm:
// 1. Calculate reference values (medians) for room tone scoring
// 1. Use pre-computed reference values (medians) for room tone scoring
// 2. Score each interval for "room tone likelihood"
// 3. Use a score threshold (0.5) to identify room tone intervals
// 4. Find consecutive runs that meet minimum duration (8 seconds)
//
// The RMS threshold parameter is used as a hard ceiling - intervals above it
// cannot be silence regardless of spectral characteristics.
// Candidates in the first 15 seconds are excluded (typically contains intro).
func findSilenceCandidatesFromIntervals(intervals []IntervalSample, threshold float64, _ float64) []SilenceRegion {
func findSilenceCandidatesFromIntervals(intervals []IntervalSample, threshold float64, medians silenceMedians) []SilenceRegion {
if len(intervals) < minimumSilenceIntervals {
return nil
}
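
To make steps 3 and 4 of the doc comment above concrete, here is a hedged, stand-alone sketch of the consecutive-run scan. The 0.5 score threshold and 8-second minimum come from that comment; everything else, including the 2-second interval length and the region type, is illustrative rather than the exact analyzer.go implementation.

package main

import (
	"fmt"
	"time"
)

type region struct{ Start, Duration time.Duration }

func main() {
	const (
		scoreThreshold = 0.5
		minRun         = 8 * time.Second
		intervalLen    = 2 * time.Second // assumed interval size for the sketch
	)
	scores := []float64{0.2, 0.7, 0.8, 0.9, 0.6, 0.1, 0.7}

	var regions []region
	runStart := -1
	flush := func(end int) {
		if runStart < 0 {
			return
		}
		if d := time.Duration(end-runStart) * intervalLen; d >= minRun {
			regions = append(regions, region{time.Duration(runStart) * intervalLen, d})
		}
		runStart = -1
	}
	for i, s := range scores {
		if s >= scoreThreshold {
			if runStart < 0 {
				runStart = i
			}
			continue
		}
		flush(i)
	}
	flush(len(scores))
	fmt.Println(regions) // [{2s 8s}]: one run long enough to qualify
}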
@@ -1251,20 +1269,10 @@ func findSilenceCandidatesFromIntervals(intervals []IntervalSample, threshold fl
if searchLimit < minimumSilenceIntervals {
searchLimit = minimumSilenceIntervals
}
searchIntervals := intervals[:searchLimit]

// Calculate medians for room tone scoring
rmsLevels := make([]float64, len(searchIntervals))
fluxValues := make([]float64, len(searchIntervals))
for i, interval := range searchIntervals {
rmsLevels[i] = interval.RMSLevel
fluxValues[i] = interval.SpectralFlux
}
sort.Float64s(rmsLevels)
sort.Float64s(fluxValues)

rmsP50 := rmsLevels[len(rmsLevels)/2]
fluxP50 := fluxValues[len(fluxValues)/2]
// Use pre-computed medians for room tone scoring
rmsP50 := medians.rmsP50
fluxP50 := medians.fluxP50

var candidates []SilenceRegion
var silenceStart time.Duration
@@ -1750,14 +1758,13 @@ func extractSpectralMetrics(metadata *ffmpeg.AVDictionary) spectralMetrics {
// extractIntervalFrameMetrics extracts per-frame metrics for interval accumulation.
// Only collects metrics that are valid per-window (aspectralstats, ebur128 windowed).
// Excludes astats which provides cumulative values, not per-interval.
func extractIntervalFrameMetrics(metadata *ffmpeg.AVDictionary) intervalFrameMetrics {
func extractIntervalFrameMetrics(metadata *ffmpeg.AVDictionary, spectral spectralMetrics) intervalFrameMetrics {
var m intervalFrameMetrics

// Peak level from astats (used for max tracking, which is valid per-interval)
m.PeakLevel, _ = getFloatMetadata(metadata, metaKeyPeakLevel)

// aspectralstats metrics (valid per-window measurements)
spectral := extractSpectralMetrics(metadata)
// aspectralstats metrics (valid per-window measurements, pre-extracted by caller)
m.SpectralMean = spectral.Mean
m.SpectralVariance = spectral.Variance
m.SpectralCentroid = spectral.Centroid
@@ -1790,14 +1797,14 @@ func extractIntervalFrameMetrics(metadata *ffmpeg.AVDictionary) intervalFrameMet
// extractFrameMetadata extracts audio analysis metadata from a filtered frame.
// Updates accumulators with spectral, astats, and ebur128 measurements.
// Called from both the main processing loop and the flush loop.
func extractFrameMetadata(metadata *ffmpeg.AVDictionary, acc *metadataAccumulators) {
func extractFrameMetadata(metadata *ffmpeg.AVDictionary, acc *metadataAccumulators, spectral spectralMetrics) {
if metadata == nil {
return
}

// Extract all aspectralstats measurements (averaged across frames)
// Accumulate pre-extracted spectral metrics (averaged across frames)
// For mono audio, spectral stats are under channel .1
acc.accumulateSpectral(extractSpectralMetrics(metadata))
acc.accumulateSpectral(spectral)

// Extract astats measurements (cumulative, so we keep the latest)
// For mono audio, stats are under channel .1
@@ -2299,12 +2306,16 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f
return nil, fmt.Errorf("failed to get filtered frame: %w", err)
}

// Extract spectral metrics once, reuse for both whole-file and interval accumulators
metadata := filteredFrame.Metadata()
spectral := extractSpectralMetrics(metadata)

// Extract measurements from frame metadata (whole-file accumulators)
extractFrameMetadata(filteredFrame.Metadata(), acc)
extractFrameMetadata(metadata, acc, spectral)

// Also accumulate into current interval for per-interval spectral data
// Filtered frames roughly correspond to input timing (just at higher sample rate)
intervalAcc.add(extractIntervalFrameMetrics(filteredFrame.Metadata()))
intervalAcc.add(extractIntervalFrameMetrics(metadata, spectral))

ffmpeg.AVFrameUnref(filteredFrame)
}
@@ -2324,11 +2335,15 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f
return nil, fmt.Errorf("failed to get filtered frame: %w", err)
}

// Extract spectral metrics once, reuse for both whole-file and interval accumulators
metadata := filteredFrame.Metadata()
spectral := extractSpectralMetrics(metadata)

// Extract measurements from remaining frames
extractFrameMetadata(filteredFrame.Metadata(), acc)
extractFrameMetadata(metadata, acc, spectral)

// Also accumulate into current interval for per-interval spectral data
intervalAcc.add(extractIntervalFrameMetrics(filteredFrame.Metadata()))
intervalAcc.add(extractIntervalFrameMetrics(metadata, spectral))

ffmpeg.AVFrameUnref(filteredFrame)
}
@@ -2347,7 +2362,18 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f

// Estimate noise floor and silence threshold from interval data
// This replaces the previous separate pre-scan pass
noiseFloorEstimate, silenceThreshold, ok := estimateNoiseFloorAndThreshold(intervals)

// Pre-compute silence detection medians (shared by noise estimation and candidate detection)
silSearchLimit := len(intervals) * silenceSearchPercent / 100
if silSearchLimit < silenceThresholdMinIntervals {
silSearchLimit = silenceThresholdMinIntervals
}
if silSearchLimit > len(intervals) {
silSearchLimit = len(intervals)
}
silMedians := computeSilenceMedians(intervals[:silSearchLimit])

noiseFloorEstimate, silenceThreshold, ok := estimateNoiseFloorAndThreshold(intervals, silMedians)
if !ok {
// Fallback if insufficient interval data (very short recordings)
noiseFloorEstimate = defaultNoiseFloor
@@ -2473,7 +2499,7 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f

// Detect silence regions using threshold already computed from interval distribution
// The silenceThreshold was calculated above via estimateNoiseFloorAndThreshold()
measurements.SilenceRegions = findSilenceCandidatesFromIntervals(intervals, silenceThreshold, 0)
measurements.SilenceRegions = findSilenceCandidatesFromIntervals(intervals, silenceThreshold, silMedians)

// Extract noise profile from best silence region (if available)
// Uses interval data for all measurements - no file re-reading required
@@ -3478,6 +3504,20 @@ func scoreSpeechCandidate(m *SpeechCandidateMetrics) float64 {
//
// Returns full SilenceCandidateMetrics with all amplitude, spectral, and loudness measurements.
func MeasureOutputSilenceRegion(outputPath string, region SilenceRegion) (*SilenceCandidateMetrics, error) {
// Open the processed audio file
reader, _, err := audio.OpenAudioFile(outputPath)
if err != nil {
return nil, fmt.Errorf("failed to open output file: %w", err)
}
defer reader.Close()

return measureOutputSilenceRegionFromReader(reader, region)
}

// measureOutputSilenceRegionFromReader performs the silence region measurement
// using an already-opened audio reader. This enables the combined
// MeasureOutputRegions function to share a single file open/close cycle.
func measureOutputSilenceRegionFromReader(reader *audio.Reader, region SilenceRegion) (*SilenceCandidateMetrics, error) {
// Diagnostic logging: function entry with region details
debugLog("=== MeasureOutputSilenceRegion: start=%.3fs, duration=%.3fs ===",
region.Start.Seconds(), region.Duration.Seconds())
@@ -3490,13 +3530,6 @@ func MeasureOutputSilenceRegion(outputPath string, region SilenceRegion) (*Silen
return nil, fmt.Errorf("invalid region: non-positive duration")
}

// Open the processed audio file
reader, _, err := audio.OpenAudioFile(outputPath)
if err != nil {
return nil, fmt.Errorf("failed to open output file: %w", err)
}
defer reader.Close()

// Build filter spec to extract and analyze the silence region
// Filter chain captures all measurements for comprehensive analysis:
// 1. atrim: extract the specific time region (start/duration format)
@@ -3836,6 +3869,57 @@ func MeasureOutputSilenceRegion(outputPath string, region SilenceRegion) (*Silen
return metrics, nil
}

// MeasureOutputRegions measures both silence and speech regions from the same
// output file in a single open/close cycle. This avoids redundant file opens,
// demuxing, and decoding that would occur when calling MeasureOutputSilenceRegion
// and MeasureOutputSpeechRegion independently.
//
// Either region parameter may be nil to skip that measurement. Returns nil for
// any skipped or failed measurement (non-fatal — matches existing behaviour).
func MeasureOutputRegions(outputPath string, silenceRegion *SilenceRegion, speechRegion *SpeechRegion) (*SilenceCandidateMetrics, *SpeechCandidateMetrics) {
if silenceRegion == nil && speechRegion == nil {
return nil, nil
}

// Open the output file once for both measurements
reader, _, err := audio.OpenAudioFile(outputPath)
if err != nil {
debugLog("Warning: Failed to open output file for region measurements: %v", err)
return nil, nil
}
defer reader.Close()

// Measure silence region first (if requested)
var silenceMetrics *SilenceCandidateMetrics
if silenceRegion != nil {
silenceMetrics, err = measureOutputSilenceRegionFromReader(reader, *silenceRegion)
if err != nil {
debugLog("Warning: Failed to measure silence region: %v", err)
// Non-fatal — continue to speech measurement
}
}

// Seek back to the beginning before measuring the speech region
if speechRegion != nil {
if silenceRegion != nil {
// Only need to seek if we already read through the file for silence
if err := reader.Seek(0); err != nil {
debugLog("Warning: Failed to seek for speech region measurement: %v", err)
return silenceMetrics, nil
}
}

speechMetrics, err := measureOutputSpeechRegionFromReader(reader, *speechRegion)
if err != nil {
debugLog("Warning: Failed to measure speech region: %v", err)
return silenceMetrics, nil
}
return silenceMetrics, speechMetrics
}

return silenceMetrics, nil
}
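
A short caller-side sketch of the combined entry point. The processor package name, the validateOutput wrapper, and its parameters are assumptions for illustration; nil metrics simply mean that a measurement was skipped or failed.

// Sketch only: drive both measurements through a single file open.
func validateOutput(outputPath string, silence *processor.SilenceRegion, speech *processor.SpeechRegion) {
	silenceMetrics, speechMetrics := processor.MeasureOutputRegions(outputPath, silence, speech)
	if silenceMetrics != nil {
		// use the silence amplitude/spectral/loudness measurements for validation
	}
	if speechMetrics != nil {
		// compare against the input speech metrics for adaptive tuning
	}
}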

// MeasureOutputSpeechRegion analyses a speech region in the output file
// to capture comprehensive metrics for adaptive filter tuning and validation.
//
@@ -3844,6 +3928,20 @@ func MeasureOutputSilenceRegion(outputPath string, region SilenceRegion) (*Silen
//
// Returns full SpeechCandidateMetrics with all amplitude, spectral, and loudness measurements.
func MeasureOutputSpeechRegion(outputPath string, region SpeechRegion) (*SpeechCandidateMetrics, error) {
// Open the processed audio file
reader, _, err := audio.OpenAudioFile(outputPath)
if err != nil {
return nil, fmt.Errorf("failed to open output file: %w", err)
}
defer reader.Close()

return measureOutputSpeechRegionFromReader(reader, region)
}

// measureOutputSpeechRegionFromReader performs the speech region measurement
// using an already-opened audio reader. This enables the combined
// MeasureOutputRegions function to share a single file open/close cycle.
func measureOutputSpeechRegionFromReader(reader *audio.Reader, region SpeechRegion) (*SpeechCandidateMetrics, error) {
// Diagnostic logging: function entry with region details
debugLog("=== MeasureOutputSpeechRegion: start=%.3fs, duration=%.3fs ===",
region.Start.Seconds(), region.Duration.Seconds())
@@ -3856,13 +3954,6 @@ func MeasureOutputSpeechRegion(outputPath string, region SpeechRegion) (*SpeechC
return nil, fmt.Errorf("invalid region: non-positive duration")
}

// Open the processed audio file
reader, _, err := audio.OpenAudioFile(outputPath)
if err != nil {
return nil, fmt.Errorf("failed to open output file: %w", err)
}
defer reader.Close()

// Build filter spec to extract and analyze the speech region
// Filter chain captures all measurements for comprehensive analysis:
// 1. atrim: extract the specific time region (start/duration format)