From 36c814e8c9b146456db8cb62743ff89c8c6a4470 Mon Sep 17 00:00:00 2001 From: sebastianrosenzweig Date: Sat, 3 Jan 2026 23:39:42 +0100 Subject: [PATCH 1/7] add real-time swipe --- pyproject.toml | 6 +++--- pytch/audio.py | 48 ++++++++++++++++-------------------------------- pytch/gui.py | 13 ------------- 3 files changed, 19 insertions(+), 48 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 27c528b..f2bfbc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pytch" -version = "2.2.0" +version = "2.3.0" description = "A Real-Time Pitch Analysis Tool For Polyphonic Music" authors = [ {name = "Pytch Contributors"} @@ -14,9 +14,9 @@ dependencies = [ "numpy>=1.25.2", "scipy>=1.11.1", "pyqt6>=6.7.0", - "libf0>=1.0.2", "sounddevice>=0.4.7", - "pyqtgraph>=0.13.1" + "pyqtgraph>=0.13.1", + "rtswipe@git+https://github.com/groupmm/real_time_swipe.git" ] requires-python = ">=3.11" diff --git a/pytch/audio.py b/pytch/audio.py index 4363c7e..63125b9 100644 --- a/pytch/audio.py +++ b/pytch/audio.py @@ -8,7 +8,7 @@ import logging import sounddevice import soundfile as sf -import libf0 +from rtswipe import RTSwipe from scipy.ndimage import median_filter from datetime import datetime import csv @@ -75,8 +75,7 @@ def check_fs(device_index, fs): logger.debug(e) valid = False - finally: - return valid + return valid @njit @@ -211,7 +210,6 @@ def __init__( fft_len=512, channels=None, device_no=None, - f0_algorithm="YIN", out_path="", ): """Initialize audio processing. @@ -222,7 +220,6 @@ def __init__( fft_len: FFT length in bins. channels: List of channels to record. device_no: Index of device to record from. - f0_algorithm: F0 algorithm to use. out_path: Output directory for F0 trajectories. 
""" self.fs = fs @@ -233,7 +230,6 @@ def __init__( self.fft_win = np.hanning(self.fft_len).reshape(-1, 1) self.channels = [0] if channels is None else channels self.device_no = device_no - self.f0_algorithm = f0_algorithm self.out_path = out_path self.f0_lvl_threshold = -70 # minimum level in dB to compute f0 estimates self.frame_rate = self.fs / self.hop_len @@ -289,6 +285,16 @@ def __init__( + [f"Confidence Channel {ch}" for ch in channels] ) + # initialize real-time SWIPE + self.rtswipe = RTSwipe( + fs=self.fs, + hop_len=self.fft_len, + f_min=55.0, + f_max=1760.0, + num_channels=len(channels), + delay=0.0, + ) + def start_stream(self): """Start recording and processing""" self.stop_stream() @@ -396,32 +402,10 @@ def compute_f0(self, audio, lvl): conf: Confidence. """ - f0 = np.zeros((1, audio.shape[1])) - conf = np.zeros((1, audio.shape[1])) - - for c in range(audio.shape[1]): - if lvl[0, c] < self.f0_lvl_threshold: - continue - - audio_tmp = np.concatenate( - (audio[:, c][::-1], audio[:, c], audio[:, c][::-1]) - ) - if self.f0_algorithm == "YIN": - f0_tmp, _, conf_tmp = libf0.yin( - audio_tmp, - Fs=self.fs, - N=self.fft_len, - H=self.fft_len, - F_min=80.0, - F_max=640.0, - threshold=0.15, - verbose=False, - ) - f0[:, c] = np.mean(f0_tmp) # take the center frame - conf[:, c] = 1 - np.mean(conf_tmp) - else: - f0[:, c] = np.zeros(f0.shape[0]) - conf[:, c] = np.zeros(f0.shape[0]) + if np.all(lvl > self.f0_lvl_threshold): + f0, conf = self.rtswipe(audio) + else: + f0 = conf = np.zeros((1, len(self.channels))) return f0, conf diff --git a/pytch/gui.py b/pytch/gui.py index 9660bc9..ea402f1 100644 --- a/pytch/gui.py +++ b/pytch/gui.py @@ -221,7 +221,6 @@ def __init__(self, sounddevice_idx, channels, fs, fft_size, out_path): self.fs = fs self.fft_size = fft_size self.out_path = out_path - self.f0_algorithms = ["YIN"] self.buf_len_sec = 30.0 # sec self.spec_scale_types = ["log", "linear"] self.ref_freq_modes = ["fixed", "highest", "lowest"] @@ -271,7 +270,6 @@ def 
__init__(self, sounddevice_idx, channels, fs, fft_size, out_path): fft_len=self.fft_size, channels=self.channels, device_no=self.sounddevice_idx, - f0_algorithm=self.f0_algorithms[0], out_path=out_path, ) @@ -475,13 +473,6 @@ def __init__(self, main_window: MainWindow): ) layout.addWidget(self.box_show_tv, 10, 1, 1, 1) - layout.addWidget(qw.QLabel("F0 Algorithm"), 11, 0) - self.select_algorithm = qw.QComboBox(self) - self.select_algorithm.addItems(main_window.f0_algorithms) - self.select_algorithm.setCurrentIndex(0) - self.select_algorithm.currentTextChanged.connect(self.on_algorithm_select) - layout.addWidget(self.select_algorithm, 11, 1, 1, 1) - layout.addWidget(qw.QLabel("Confidence Threshold"), 12, 0) self.noise_thresh_slider = qw.QSlider() self.noise_thresh_slider.setRange(0, 10) @@ -561,10 +552,6 @@ def on_max_freq_changed(self, f): self.main_window.cur_disp_freq_lims ) - def on_algorithm_select(self, algorithm): - """Update function for F0 algorithm on user interaction.""" - self.main_window.audio_processor.f0_algorithm = algorithm - def on_conf_threshold_changed(self, val): """Update function for confidence threshold on user interaction.""" self.noise_thresh_label.setText(str(val / 10.0)) From c6a7dd2b56f546c82f2914bd195ce522fe525182 Mon Sep 17 00:00:00 2001 From: sebastianrosenzweig Date: Sun, 4 Jan 2026 22:37:30 +0100 Subject: [PATCH 2/7] swipe and yin --- pytch/audio.py | 39 ++++++++++++++++++++++++++++++++++----- pytch/gui.py | 15 ++++++++++++++- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/pytch/audio.py b/pytch/audio.py index 63125b9..2128df8 100644 --- a/pytch/audio.py +++ b/pytch/audio.py @@ -8,6 +8,7 @@ import logging import sounddevice import soundfile as sf +import libf0 from rtswipe import RTSwipe from scipy.ndimage import median_filter from datetime import datetime @@ -75,7 +76,8 @@ def check_fs(device_index, fs): logger.debug(e) valid = False - return valid + finally: + return valid @njit @@ -210,6 +212,7 @@ def __init__( 
fft_len=512, channels=None, device_no=None, + f0_algorithm="YIN", out_path="", ): """Initialize audio processing. @@ -220,6 +223,7 @@ def __init__( fft_len: FFT length in bins. channels: List of channels to record. device_no: Index of device to record from. + f0_algorithm: F0 algorithm to use. out_path: Output directory for F0 trajectories. """ self.fs = fs @@ -230,6 +234,7 @@ def __init__( self.fft_win = np.hanning(self.fft_len).reshape(-1, 1) self.channels = [0] if channels is None else channels self.device_no = device_no + self.f0_algorithm = f0_algorithm self.out_path = out_path self.f0_lvl_threshold = -70 # minimum level in dB to compute f0 estimates self.frame_rate = self.fs / self.hop_len @@ -402,10 +407,34 @@ def compute_f0(self, audio, lvl): conf: Confidence. """ - if np.all(lvl > self.f0_lvl_threshold): - f0, conf = self.rtswipe(audio) - else: - f0 = conf = np.zeros((1, len(self.channels))) + f0 = np.zeros((1, audio.shape[1])) + conf = np.zeros((1, audio.shape[1])) + + if self.f0_algorithm == "YIN": + for c in range(audio.shape[1]): + if lvl[0, c] < self.f0_lvl_threshold: + continue + + audio_tmp = np.concatenate( + (audio[:, c][::-1], audio[:, c], audio[:, c][::-1]) + ) + f0_tmp, _, conf_tmp = libf0.yin( + audio_tmp, + Fs=self.fs, + N=self.fft_len, + H=self.fft_len, + F_min=80.0, + F_max=640.0, + threshold=0.15, + verbose=False, + ) + f0[:, c] = np.mean(f0_tmp) # take the center frame + conf[:, c] = 1 - np.mean(conf_tmp) + elif self.f0_algorithm == "SWIPE": + if np.all(lvl > self.f0_lvl_threshold): + f0, conf = self.rtswipe(audio) + f0 = f0.reshape(1, -1) + conf = conf.reshape(1, -1) return f0, conf diff --git a/pytch/gui.py b/pytch/gui.py index ea402f1..74c77cc 100644 --- a/pytch/gui.py +++ b/pytch/gui.py @@ -221,6 +221,7 @@ def __init__(self, sounddevice_idx, channels, fs, fft_size, out_path): self.fs = fs self.fft_size = fft_size self.out_path = out_path + self.f0_algorithms = ["SWIPE", "YIN"] self.buf_len_sec = 30.0 # sec self.spec_scale_types = 
["log", "linear"] self.ref_freq_modes = ["fixed", "highest", "lowest"] @@ -243,7 +244,7 @@ def __init__(self, sounddevice_idx, channels, fs, fft_size, out_path): self.cur_spec_scale_type = self.spec_scale_types[0] self.cur_ref_freq_mode = self.ref_freq_modes[0] self.cur_ref_freq = 220 # Hz - self.cur_conf_threshold = 0.5 + self.cur_conf_threshold = 0.2 self.cur_gradient_tol = 600 # Cents self.cur_smoothing_len = 3 # bins self.gui_refresh_ms = int(np.round(1000 / 60)) # 60 fps @@ -270,6 +271,7 @@ def __init__(self, sounddevice_idx, channels, fs, fft_size, out_path): fft_len=self.fft_size, channels=self.channels, device_no=self.sounddevice_idx, + f0_algorithm=self.f0_algorithms[0], out_path=out_path, ) @@ -473,6 +475,13 @@ def __init__(self, main_window: MainWindow): ) layout.addWidget(self.box_show_tv, 10, 1, 1, 1) + layout.addWidget(qw.QLabel("F0 Algorithm"), 11, 0) + self.select_algorithm = qw.QComboBox(self) + self.select_algorithm.addItems(main_window.f0_algorithms) + self.select_algorithm.setCurrentIndex(0) + self.select_algorithm.currentTextChanged.connect(self.on_algorithm_select) + layout.addWidget(self.select_algorithm, 11, 1, 1, 1) + layout.addWidget(qw.QLabel("Confidence Threshold"), 12, 0) self.noise_thresh_slider = qw.QSlider() self.noise_thresh_slider.setRange(0, 10) @@ -552,6 +561,10 @@ def on_max_freq_changed(self, f): self.main_window.cur_disp_freq_lims ) + def on_algorithm_select(self, algorithm): + """Update function for F0 algorithm on user interaction.""" + self.main_window.audio_processor.f0_algorithm = algorithm + def on_conf_threshold_changed(self, val): """Update function for confidence threshold on user interaction.""" self.noise_thresh_label.setText(str(val / 10.0)) From 4e292edca639047c4fefdd5327dd3fa037d416ba Mon Sep 17 00:00:00 2001 From: sebastianrosenzweig Date: Thu, 8 Jan 2026 21:36:07 +0100 Subject: [PATCH 3/7] use updated libf0 fork --- pyproject.toml | 5 +++-- pytch/audio.py | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) 
diff --git a/pyproject.toml b/pyproject.toml index f2bfbc5..00e431d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pytch" -version = "2.3.0" +version = "2.3.0rc" description = "A Real-Time Pitch Analysis Tool For Polyphonic Music" authors = [ {name = "Pytch Contributors"} @@ -16,7 +16,8 @@ dependencies = [ "pyqt6>=6.7.0", "sounddevice>=0.4.7", "pyqtgraph>=0.13.1", - "rtswipe@git+https://github.com/groupmm/real_time_swipe.git" + "rtswipe@git+https://github.com/groupmm/real_time_swipe.git", + "libf0@git+https://github.com/sebastianrosenzweig/libf0.git" ] requires-python = ">=3.11" diff --git a/pytch/audio.py b/pytch/audio.py index 2128df8..a6e6698 100644 --- a/pytch/audio.py +++ b/pytch/audio.py @@ -76,8 +76,7 @@ def check_fs(device_index, fs): logger.debug(e) valid = False - finally: - return valid + return valid @njit From c321e67b13ef06c9033698fb3365e9bdb898b26c Mon Sep 17 00:00:00 2001 From: sebastianrosenzweig Date: Fri, 16 Jan 2026 20:44:12 +0100 Subject: [PATCH 4/7] more paper sections --- paper/paper.bib | 8 ++++++++ paper/paper.md | 38 ++++++++++++++++++++++++-------------- pyproject.toml | 4 ++-- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index e619fc3..83a310c 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,3 +1,11 @@ +@inproceedings{MeierSSMB25_RealTimeSWIPE_CMMR, + author = {Peter Meier and Sebastian Strahl and Simon Schw{\"a}r and Meinard M{\"u}ller and Stefan Balke}, + title = {Pitch Estimation in Real Time: Revisiting {SWIPE} With Causal Windowing}, + booktitle = {Proceedings of the International Symposium on Computer Music Multidisciplinary Research ({CMMR})}, + address = {London, UK}, + year = {2025} +} + @article{MeierCM24_RealTimePLP_TISMIR, author = {Peter Meier and Ching-Yu Chiu and Meinard M{\"u}ller}, title = {{A} Real-Time Beat Tracking System with Zero Latency and Enhanced Controllability}, diff --git 
a/paper/paper.md b/paper/paper.md index e6aa802..52d3806 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,31 +30,35 @@ affiliations: index: 2 - name: University of Potsdam, Potsdam, Germany index: 3 -date: 30 May 2025 +date: 16 January 2026 bibliography: paper.bib --- # Summary Polyphonic singing is one of the most widespread forms of music-making. During a performance, singers must constantly adjust their pitch to stay in tune with one another — a complex skill that requires extensive practice. Research has shown that pitch monitoring tools can assist singers in fine-tuning their intonation during a performance [@BerglinPD22_VisualFeedback_JPM]. Specifically, real-time visualizations of the fundamental frequency (F0), which represents the pitch of the singing voice, help singers assess their pitch relative to a fixed reference or other voices. -To support the monitoring of polyphonic singing performances, we developed `pytch`, an interactive Python tool with a graphical user interface (GUI) designed to record, process, and visualize multiple voices in real time. The GUI displays vocal spectra and estimated F0 trajectories for all singers, as well as the harmonic intervals between them. Additionally, users can adjust visual and algorithmic parameters interactively to accommodate different input devices, microphone signals, singing styles, and use cases. Written in Python, `pytch` utilizes the `libf0` library [@RosenzweigSM22_libf0_ISMIR-LBD] for real-time F0 estimation and `pyqtgraph`[^1] for efficient visualizations of the analysis results. -Our tool builds upon a late-breaking demo in [@KriegerowskiS_Pytch_2017], which we refer to as version 1. Since then, the tool has been significantly extended with a new real-time graphics engine, a modular audio processing backend that facilitates the integration of additional algorithms, and improved support for a wider range of platforms and recording hardware, which we refer to as version 2. 
Over its seven years of development, `pytch` has been tested and refined through use in several rehearsals, workshops, and field studies — including Sardinian quartet singing (see demo video[^2]) and traditional Georgian singing (see demo video[^3]). +To support the monitoring of polyphonic singing performances, we developed `pytch`, an interactive Python tool with a graphical user interface (GUI) designed to record, process, and visualize multiple voices in real time. The GUI displays vocal spectra and estimated F0 trajectories for all singers, as well as the harmonic intervals between them. Additionally, users can adjust visual and algorithmic parameters interactively to accommodate different input devices, microphone signals, singing styles, and use cases. Written in Python, `pytch` utilizes the libraries `libf0` [@RosenzweigSM22_libf0_ISMIR-LBD] and `rtswipe` [@MeierSSMB25_RealTimeSWIPE_CMMR] for real-time F0 estimation, and `pyqtgraph`[^1] for efficient visualizations of the analysis results. +Our tool builds upon a late-breaking demo in [@KriegerowskiS_Pytch_2017], which we refer to as version 1. Since then, the tool has been significantly extended with a new real-time graphics engine, a modular audio processing backend that facilitates the integration of additional algorithms, and improved support for a wider range of platforms and recording hardware, which we refer to as version 2. Its applications range from research in the field of computational musicology to pedagogical contexts focused on intonation and harmonic listening. [^1]: -[^2]: -[^3]: # Statement of Need Software that assesses the pitch of a singing voice in real time is best known from Karaoke singing applications, such as Let's Sing[^2], Rock Band[^3], or Cantamus[^4]. These tools typically compare the singer’s pitch to a score reference to judge whether notes are ‘correct’ or ‘incorrect’. However, such applications face several limitations when applied to polyphonic or group singing contexts. 
Most notably, many Karaoke systems can only process one or two singing voices at a time, which is problematic for monitoring group performances. Additionally, software that relies on a score as a reference poses challenges for a cappella performances, where singers may drift together in pitch over time while maintaining relative harmony, or in orally-transmitted traditions that may lack a formal score altogether. Finally, existing open-source research software for singing voice processing, like Praat [@Boersma01_Praat_GI], Sonic Visualiser [@CannamLS10_SonicVisualizer_ICMC], and Tarsos [@SixCL13_Tarsos_JNMR], lack real-time feedback, preventing an effective feedback loop between singers and their tool. -To address these challenges, we developed `pytch`. Our tool is currently the only software that enables singers and conductors to monitor and train harmonic interval singing in real time — a skill that is essential in many vocal traditions. This includes not only polyphonic genres such as traditional Georgian vocal music [@ScherbaumMRM19_MultimediaRecordings_FMA] or Barbershop singing [@HagermanS80_Barbershop_CITESEER], where precise tuning between voices is stylistically central, but also the practice of non-tempered tuning systems found in various oral traditions. In more detail, the vocal spectra can help singers fine-tune the expression of formant frequencies, while melodic and harmonic issues become visible through F0 trajectories and harmonic intervals. Unlike many existing tools, `pytch` does not require a musical score, making it well-suited for rehearsals, ethnomusicological research and pedagogical contexts focused on intonation and harmonic listening. +To address these challenges, we developed `pytch`. Our tool is currently the only software that enables singers and conductors to monitor and train harmonic interval singing in real time — a skill that is essential in many vocal traditions. 
This includes not only polyphonic genres such as traditional Georgian vocal music [@ScherbaumMRM19_MultimediaRecordings_FMA] or Barbershop singing [@HagermanS80_Barbershop_CITESEER], where precise tuning between voices is stylistically central, but also the practice of non-tempered tuning systems found in various oral traditions. Unlike many existing tools, `pytch` does not require a musical score. The vocal spectra can help singers fine-tune the expression of formant frequencies, while melodic and harmonic issues become visible through F0 trajectories and harmonic intervals. -In addition to its practical applications, `pytch` also provides a flexible platform for music information retrieval (MIR) research on real-time audio processing. Working with real-time data introduces challenges such as a limited audio context for analysis and strict timing constraints to ensure low-latency processing. Researchers can use `pytch` to develop, test, and compare algorithms for F0 estimation and other music information retrieval tasks [@StefaniT22_RealTimeMIR_DAFX;@Goto04_RealTimeF0_SC;@MeierCM24_RealTimePLP_TISMIR]. +[^2]: +[^3]: +[^4]: -[^4]: -[^5]: -[^6]: +# Research Impact Statement +Over its seven years of development, `pytch` has been tested and refined through use in ensemble rehearsals, singing workshops, and ethnomusicological field research. For example, the tool was used to analyze the Sardinian singing style of the "Quintina", in which four singers manage to fuse the high-frequency partials of their voices in such a way that an apparent fifth voice appears (see demo video[^2]). Furthermore, `pytch` has been used during a field expedition to Georgia, where ethnomusicologists recorded and analyzed traditional vocal music, which belongs to the UNESCO Intangible Cultural Heritage (see demo video[^3]). 
+ +In addition to its use in musicological research, `pytch` also provides a platform for music information retrieval (MIR) research on real-time audio processing. Working with real-time data introduces challenges such as a limited audio context for analysis and strict timing constraints to ensure low-latency processing. Researchers can use `pytch` to develop, test, and compare algorithms for F0 estimation and other music information retrieval tasks [@StefaniT22_RealTimeMIR_DAFX;@Goto04_RealTimeF0_SC;@MeierCM24_RealTimePLP_TISMIR]. A notable example is the `rtswipe` [@MeierSSMB25_RealTimeSWIPE_CMMR] library, which was inspired by `pytch` use cases and integrated after its initial release. Using its implementation significantly increased the speed and accuracy of F0 estimates in `pytch`; consequently, it was selected as the default algorithm in our tool. + +[^5]: +[^6]: # Multitrack Singing Recordings @@ -68,14 +72,17 @@ In addition to live monitoring, `pytch` can also be used to analyze pre-recorded [^7]: [^8]: +# Software Design +`pytch` is implemented in Python, the predominant language for research software in music computing, which facilitates contributions from other researchers. The `pytch` codebase systematically separates GUI code from audio processing code to ease maintenance. -# Audio Processing +## Audio Processing The real-time audio processing pipeline implemented in the file `audio.py` is the heart of `pytch` and consists of two main stages: recording and analysis. The recording stage captures multichannel audio waveforms from the soundcard or an external audio interface using the `sounddevice` library. The library is based on PortAudio and supports a wide range of operating systems, audio devices, and sampling rates. The recorded audio is received in chunks via a recording callback and fed into a ring buffer shared with the analysis process. 
When the buffer is sufficiently filled with audio chunks, the analysis process reads the recorded audio to compute several audio features. -For each channel, the analysis stage computes the audio level in dBFS, a time--frequency representation of the audio signal via the Short-Time Fourier Transform (see [@Mueller21_FMP_SPRINGER] for fundamentals of music processing), and an estimate of the F0 along with a confidence value, using the `libf0` library [@RosenzweigSM22_libf0_ISMIR-LBD]. The library includes several implementations of well-known F0 estimation algorithms. We make use of YIN [@CheveigneK02_YIN_JASA], which is a time-domain algorithm that computes the F0 based on a tweaked auto-correlation function. It is computationally efficient and well-suited for low-latency applications, but it tends to suffer from estimation errors, particularly confusions with higher harmonics such as the octave. The obtained F0 estimates, which are natively computed in the unit Hz, are converted to the unit cents using a user-specified reference frequency. Depending on the audio quality and vocal characteristics, F0 estimates may exhibit artifacts such as discontinuities or pitch slides, which can make the resulting trajectories difficult to interpret [@RosenzweigSM19_StableF0_ISMIR]. Previous research has shown that using throat microphones can improve the isolation of individual voices in group singing contexts, resulting in cleaner signals and more accurate F0 estimates [@Scherbaum16_LarynxMicrophones_IWFMA]. To further enhance interpretability, `pytch` includes several optional post-processing steps: a confidence threshold to discard estimates with low confidence score, a median filter to smooth the trajectories, and a gradient filter to suppress abrupt pitch slides. As a final step in the audio analysis, the harmonic intervals between the F0 trajectories are computed. Every audio feature is stored separately in a dedicated ring buffer. 
After processing, the pipeline sets a flag that notifies the GUI that new data is ready for visualization. +For each channel, the analysis stage computes the audio level in dBFS, a time--frequency representation of the audio signal via the Short-Time Fourier Transform (see [@Mueller21_FMP_SPRINGER] for fundamentals of music processing), and an estimate of the F0 along with a confidence value. There are currently two F0 algorithms available: YIN [@CheveigneK02_YIN_JASA] from the `libf0` library [@RosenzweigSM22_libf0_ISMIR-LBD], and a real-time version of SWIPE [@CamachoH08_SawtoothWaveform_JASA] from the `rtswipe` library [@MeierSSMB25_RealTimeSWIPE_CMMR]. YIN is a time-domain algorithm that computes the F0 based on a tweaked auto-correlation function. It is computationally efficient and well-suited for low-latency applications, but it tends to suffer from estimation errors, particularly confusions with higher harmonics such as the octave. SWIPE is a frequency-domain algorithm that computes the F0 by correlating pitch candidate kernels derived from a sawtooth waveform with the spectrum of an input signal. It is known to be more robust to noise than YIN. +The obtained F0 estimates, which are natively computed in the unit Hz, are converted to the unit cents using a user-specified reference frequency. Depending on the audio quality and vocal characteristics, F0 estimates may exhibit artifacts such as discontinuities or pitch slides, which can make the resulting trajectories difficult to interpret [@RosenzweigSM19_StableF0_ISMIR]. Previous research has shown that using throat microphones can improve the isolation of individual voices in group singing contexts, resulting in cleaner signals and more accurate F0 estimates [@Scherbaum16_LarynxMicrophones_IWFMA]. 
To further enhance interpretability, `pytch` includes several optional post-processing steps: a confidence threshold to discard estimates with low confidence score, a median filter to smooth the trajectories, and a gradient filter to suppress abrupt pitch slides. As a final step in the audio analysis, the harmonic intervals between the F0 trajectories are computed. Every audio feature is stored separately in a dedicated ring buffer. After processing, the pipeline sets a flag that notifies the GUI that new data is ready for visualization. -# Graphical User Interface (GUI) +## Graphical User Interface (GUI) In this section, we provide a step-by-step explanation of the `pytch` GUI implemented in the file `gui.py`. Right after the program start, a startup menu opens in which the user is asked to specify the soundcard, input channels, sampling rate, and window size for processing (see Figure \autoref{fig:menu}). Furthermore, the user can choose to store the recorded audio and the F0 trajectories on disk. ![`pytch` startup menu.\label{fig:menu}](../pictures/menu.png){ width=50% } @@ -88,7 +95,10 @@ The main GUI is organized into three horizontal sections. On the left, a control The right section, referred to as the "trajectory view," provides time-based visualizations of either the F0 trajectories ("pitches" tab) or the harmonic intervals between voices ("differential" tab) with a 10 second time context. Using the controls in the left-side menu, the user can select the F0 estimation algorithm and improve the real-time visualization by adjusting the confidence threshold, the median filter length for smoothing, and the tolerance of the gradient filter. F0 and interval trajectories can be displayed with respect to a fixed reference frequency or a dynamic one derived from a selected channel, the lowest, or highest detected voice. Axis limits for this section can also be manually set. 
+# AI Usage Disclosure +The authors used the AI tool ChatGPT to assist with grammar, spelling, and language refinement in this article. In addition, ChatGPT was used to generate example code for PyQt6 functions and classes. No code was copied verbatim from ChatGPT; all generated examples were reviewed, adapted, and integrated by the authors. + # Acknowledgements -We would like to thank Lukas Dietz for his help with the implementation, Peter Meier and Sebastian Strahl for the collaboration on real-time implementations, and all the singers who contributed to testing `pytch` during its development. +We would like to thank Lukas Dietz for his help with the implementation, Peter Meier and Sebastian Strahl for the collaboration on the real-time SWIPE implementation, and all the singers who contributed to testing `pytch` during its development. # References diff --git a/pyproject.toml b/pyproject.toml index 00e431d..f6af893 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pytch" -version = "2.3.0rc" +version = "2.3.0" description = "A Real-Time Pitch Analysis Tool For Polyphonic Music" authors = [ {name = "Pytch Contributors"} @@ -17,7 +17,7 @@ dependencies = [ "sounddevice>=0.4.7", "pyqtgraph>=0.13.1", "rtswipe@git+https://github.com/groupmm/real_time_swipe.git", - "libf0@git+https://github.com/sebastianrosenzweig/libf0.git" + "libf0>=1.1.0" ] requires-python = ">=3.11" From c186c9ada7c529d4e6de0b677089b2ac8e28c22f Mon Sep 17 00:00:00 2001 From: sebastianrosenzweig Date: Fri, 30 Jan 2026 16:35:09 +0100 Subject: [PATCH 5/7] fix ref freq display --- pytch/gui_utils.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pytch/gui_utils.py b/pytch/gui_utils.py index 5871037..1da9422 100644 --- a/pytch/gui_utils.py +++ b/pytch/gui_utils.py @@ -26,25 +26,29 @@ class FloatQLineEdit(qw.QLineEdit): accepted_value = qc.pyqtSignal(float) def __init__(self, 
default=None): - """Initialization. + super().__init__() - Args: - default: Default value. + validator = qg.QDoubleValidator(self) + validator.setNotation(qg.QDoubleValidator.Notation.StandardNotation) + validator.setLocale(qc.QLocale.c()) # force "." as decimal separator + self.setValidator(validator) - """ - qw.QLineEdit.__init__(self) - self.setValidator(qg.QDoubleValidator()) self.setFocusPolicy(qc.Qt.FocusPolicy.ClickFocus | qc.Qt.FocusPolicy.TabFocus) - self.returnPressed.connect(self.do_check) - p = self.parent() - if p: - self.returnPressed.connect(p.setFocus) - if default: - self.setText(str(default)) + + self.editingFinished.connect(self.do_check) + + if default is not None: + self.setText(f"{float(default)}") def do_check(self): - text = self.text() - val = float(text) + text = self.text().replace(",", "") + try: + val = float(text) + except ValueError: + return + + # Normalize display (no scientific notation) + self.setText(f"{val}") self.accepted_value.emit(val) From 34ba71c0756a06bed36ff4956e21cc6e82d40415 Mon Sep 17 00:00:00 2001 From: sebastianrosenzweig Date: Sun, 1 Feb 2026 18:29:31 +0100 Subject: [PATCH 6/7] more --- paper/paper.bib | 20 ++++++++++++++++++++ paper/paper.md | 42 +++++++++++++++++++++++------------------- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 83a310c..ca511ad 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -6,6 +6,16 @@ @inproceedings{MeierSSMB25_RealTimeSWIPE_CMMR year = {2025} } +@software{Aubier25_FUS_JOSS, + author = {Tom Aubier}, + title = {{CoperniFUS}: {A} flexible {P}ython-based {GUI} for stereotaxic {F}ocused {U}ltra{S}ound ({FUS}) experiment planning}, + year = {2025}, + publisher = {Zenodo}, + version = {0.1.2}, + doi = {10.5281/zenodo.15720551}, + url = {https://doi.org/10.5281/zenodo.15720551}, +} + @article{MeierCM24_RealTimePLP_TISMIR, author = {Peter Meier and Ching-Yu Chiu and Meinard M{\"u}ller}, title = {{A} Real-Time Beat Tracking System 
with Zero Latency and Enhanced Controllability}, @@ -83,6 +93,16 @@ @article{RosenzweigCWSGM20_DCS_TISMIR url-demo = {https://www.audiolabs-erlangen.de/resources/MIR/2020-DagstuhlChoirSet} } +@software{PatakyND19_mwarp1d_JOSS, + author = {Todd Pataky and Hanaa Naouma and Cyril Donnelly}, + title = {{mwarp1d}: {M}anual one-dimensional data warping in {P}ython and {P}y{Q}t}, + year = {2019}, + publisher = {Zenodo}, + version = {v0.2.0}, + doi = {10.5281/zenodo.3568447}, + url = {https://doi.org/10.5281/zenodo.3568447}, +} + @inproceedings{ScherbaumMRM19_MultimediaRecordings_FMA, author = {Frank Scherbaum and Nana Mzhavanadze and Sebastian Rosenzweig and Meinard M{\"u}ller}, title = {Multi-media recordings of traditional {G}eorgian vocal music for computational analysis}, diff --git a/paper/paper.md b/paper/paper.md index 52d3806..8a5b52e 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,35 +30,33 @@ affiliations: index: 2 - name: University of Potsdam, Potsdam, Germany index: 3 -date: 16 January 2026 +date: 1 February 2026 bibliography: paper.bib --- # Summary Polyphonic singing is one of the most widespread forms of music-making. During a performance, singers must constantly adjust their pitch to stay in tune with one another — a complex skill that requires extensive practice. Research has shown that pitch monitoring tools can assist singers in fine-tuning their intonation during a performance [@BerglinPD22_VisualFeedback_JPM]. Specifically, real-time visualizations of the fundamental frequency (F0), which represents the pitch of the singing voice, help singers assess their pitch relative to a fixed reference or other voices. -To support the monitoring of polyphonic singing performances, we developed `pytch`, an interactive Python tool with a graphical user interface (GUI) designed to record, process, and visualize multiple voices in real time. 
The GUI displays vocal spectra and estimated F0 trajectories for all singers, as well as the harmonic intervals between them. Additionally, users can adjust visual and algorithmic parameters interactively to accommodate different input devices, microphone signals, singing styles, and use cases. Written in Python, `pytch` utilizes the libraries `libf0` [@RosenzweigSM22_libf0_ISMIR-LBD] and `rtswipe` [@MeierSSMB25_RealTimeSWIPE_CMMR] for real-time F0 estimation, and `pyqtgraph`[^1] for efficient visualizations of the analysis results. -Our tool builds upon a late-breaking demo in [@KriegerowskiS_Pytch_2017], which we refer to as version 1. Since then, the tool has been significantly extended with a new real-time graphics engine, a modular audio processing backend that facilitates the integration of additional algorithms, and improved support for a wider range of platforms and recording hardware, which we refer to as version 2. Its applications range from research in the field of computational musicology to pedagogical contexts focused on intonation and harmonic listening. - -[^1]: +To support the monitoring of polyphonic singing performances, we developed `pytch`, an interactive Python tool with a graphical user interface (GUI) designed to record, process, and visualize multiple voices in real time. The GUI displays vocal spectra and estimated F0 trajectories for all singers, as well as the harmonic intervals between them. Additionally, users can adjust visual and algorithmic parameters interactively to accommodate different input devices, microphone signals, singing styles, and use cases. +Our tool builds upon a late-breaking demo in [@KriegerowskiS_Pytch_2017], which we refer to as version 1. 
Since its initial release, the tool has been significantly extended with a new real-time graphics engine, a modular audio processing backend that facilitates the integration of additional algorithms, and improved support for a wider range of platforms and recording hardware, which we refer to as version 2. The applications of `pytch` range from research in the field of computational musicology to pedagogical contexts focused on intonation and harmonic listening. # Statement of Need -Software that assesses the pitch of a singing voice in real time is best known from Karaoke singing applications, such as Let's Sing[^2], Rock Band[^3], or Cantamus[^4]. These tools typically compare the singer’s pitch to a score reference to judge whether notes are ‘correct’ or ‘incorrect’. However, such applications face several limitations when applied to polyphonic or group singing contexts. Most notably, many Karaoke systems can only process one or two singing voices at a time, which is problematic for monitoring group performances. Additionally, software that relies on a score as a reference poses challenges for a cappella performances, where singers may drift together in pitch over time while maintaining relative harmony, or in orally-transmitted traditions that may lack a formal score altogether. Finally, existing open-source research software for singing voice processing, like Praat [@Boersma01_Praat_GI], Sonic Visualiser [@CannamLS10_SonicVisualizer_ICMC], and Tarsos [@SixCL13_Tarsos_JNMR], lack real-time feedback, preventing an effective feedback loop between singers and their tool. +Software that assesses the pitch of a singing voice in real time is best known from Karaoke singing applications, such as Let's Sing[^1], Rock Band[^2], or Cantamus[^3]. These tools typically compare the singer’s pitch to a score reference to judge whether notes are ‘correct’ or ‘incorrect’. However, such applications face several limitations when applied to polyphonic or group singing contexts. 
Most notably, many Karaoke systems can only process one or two singing voices at a time, which is problematic for monitoring group performances. Additionally, software that relies on a score as a reference poses challenges for a cappella performances, where singers may drift together in pitch over time while maintaining relative harmony, or in orally-transmitted traditions that may lack a formal score altogether. Finally, existing open-source research software for singing voice processing, like Praat [@Boersma01_Praat_GI], Sonic Visualiser [@CannamLS10_SonicVisualizer_ICMC], and Tarsos [@SixCL13_Tarsos_JNMR], lack real-time feedback, preventing an effective feedback loop between singers and their tool. To address these challenges, we developed `pytch`. Our tool is currently the only software that enables singers and conductors to monitor and train harmonic interval singing in real time — a skill that is essential in many vocal traditions. This includes not only polyphonic genres such as traditional Georgian vocal music [@ScherbaumMRM19_MultimediaRecordings_FMA] or Barbershop singing [@HagermanS80_Barbershop_CITESEER], where precise tuning between voices is stylistically central, but also the practice of non-tempered tuning systems found in various oral traditions. Unlike many existing tools, `pytch` does not require a musical score. The vocal spectra can help singers fine-tune the expression of formant frequencies, while melodic and harmonic issues become visible through F0 trajectories and harmonic intervals. -[^2]: -[^3]: -[^4]: +[^1]: +[^2]: +[^3]: # Research Impact Statement -Over its seven years of development to now, `pytch` has been tested and iterated through use in ensemble rehearsals, singing workshops, and ethnomusicological field research. 
For example, the tool was used to analyze the Sardinian singing style of the "Quintina" in which four singers manage to fuse the high frequency partials of their voices in such a way that an apparent fifth voice appears (see demo video[^2]). Furthermore, `pytch` has been used during a field expedition to Georgia, where ethnomusicologists recorded and analyzed traditional vocal music which belongs to the UNESCO intangible world cultural heritage (see demo video[^3]). +Over its seven years of development to now, `pytch` has been tested and iterated through use in ensemble rehearsals, singing workshops, and ethnomusicological field research. For example, the tool was used to analyze the Sardinian singing style of the "Quintina" in which four singers manage to fuse the high frequency partials of their voices in such a way that an apparent fifth voice appears (see demo video[^4]). Furthermore, `pytch` has been used during a field expedition to Georgia, where ethnomusicologists recorded and analyzed traditional vocal music which belongs to the UNESCO intangible world cultural heritage (see demo video[^5]). -In addition to its use in musicological research, `pytch` also provides a platform for music information retrieval (MIR) research on real-time audio processing. Working with real-time data introduces challenges such as a limited audio context for analysis and strict timing constraints to ensure low-latency processing. Researchers can use `pytch` to develop, test, and compare algorithms for F0 estimation and other music information retrieval tasks [@StefaniT22_RealTimeMIR_DAFX;@Goto04_RealTimeF0_SC;@MeierCM24_RealTimePLP_TISMIR]. A notable example is the `rtswipe` [@MeierSSMB25_RealTimeSWIPE_CMMR] library, which was inspired by `pytch` use cases and integrated after its initial release. Using its implementation significantly increased the speed and accuracy of F0 estimates in `pytch`; consequently, it was selected as the default algorithm in our tool. 
+In addition to its use in musicological research, `pytch` also provides a platform for music information retrieval (MIR) research on real-time audio processing. Working with real-time data introduces challenges such as a limited audio context for analysis and strict timing constraints to ensure low-latency processing. Researchers can use `pytch` to develop, test, and compare algorithms for F0 estimation and other music information retrieval tasks [@StefaniT22_RealTimeMIR_DAFX;@Goto04_RealTimeF0_SC;@MeierCM24_RealTimePLP_TISMIR]. A notable example is the `rtswipe` [@MeierSSMB25_RealTimeSWIPE_CMMR] library, which was inspired by `pytch` use cases and integrated after its initial release. Using its implementation significantly increased the speed and accuracy of F0 estimates in `pytch`; consequently, it was selected as the default algorithm in our tool from version 2.3 onward. -[^5]: -[^6]: +[^4]: +[^5]: # Multitrack Singing Recordings @@ -67,23 +65,29 @@ To fully leverage the capabilities of `pytch`, it is essential to record each si One way to reduce cross-talk is to increase the physical distance between singers or to record them in isolation. However, this is not always feasible, as singers need to hear one another to maintain accurate tuning. An effective workaround is the use of contact microphones, such as throat microphones, which capture vocal fold vibrations directly from the skin of the throat. This method offers a significant advantage: the recorded signals are largely immune to interference from other singers, resulting in much cleaner, more isolated recordings. Throat microphones have successfully been used to record vocal ensembles in several past studies [@Scherbaum16_LarynxMicrophones_IWFMA]. -In addition to live monitoring, `pytch` can also be used to analyze pre-recorded multitrack singing performances. 
By playing back individual vocal tracks in a digital audio workstation (DAW) and using virtual audio routing tools such as Loopback[^7] (macOS) or BlackHole[^8], these tracks can be streamed into `pytch` as if they were live microphone inputs. This setup, which was also used in the demo video[^3], allows users to benefit from `pytch`’s real-time visualization and analysis features during evaluation of rehearsals, performances, or field recordings. +In addition to live monitoring, `pytch` can also be used to analyze pre-recorded multitrack singing performances. By playing back individual vocal tracks in a digital audio workstation (DAW) and using virtual audio routing tools such as Loopback[^6] (macOS) or BlackHole[^7], these tracks can be streamed into `pytch` as if they were live microphone inputs. This setup, which was also used in the demo video[^5], allows users to benefit from `pytch`’s real-time visualization and analysis features during evaluation of rehearsals, performances, or field recordings. -[^7]: -[^8]: +[^6]: +[^7]: # Software Design `pytch` is implemented in Python, the predominant language for research software in music computing, which facilitates contributions from other researchers. The `pytch` codebase systematically separates GUI code from audio processing code to ease maintenance. ## Audio Processing -The real-time audio processing pipeline implemented in the file `audio.py` is the heart of `pytch` and consists of two main stages: recording and analysis. The recording stage captures multichannel audio waveforms from the soundcard or an external audio interface using the `sounddevice` library. The library is based on PortAudio and supports a wide range of operating systems, audio devices, and sampling rates. The recorded audio is received in chunks via a recording callback and fed into a ring buffer shared with the analysis process. 
When the buffer is sufficiently filled with audio chunks, the analysis process reads the recorded audio to compute several audio features. +The real-time audio processing pipeline implemented in the file `audio.py` is the heart of `pytch` and consists of two main stages: recording and analysis. The recording stage captures multichannel audio waveforms from the soundcard or an external audio interface using the `sounddevice` library[^8]. The library is based on PortAudio[^9] and supports a wide range of operating systems, audio devices, and sampling rates. The recorded audio is received in chunks via a recording callback and fed into a ring buffer shared with the analysis process. When the buffer is sufficiently filled with audio chunks, the analysis process reads the recorded audio to compute several audio features.

For each channel, the analysis stage computes the audio level in dBFS, a time--frequency representation of the audio signal via the Short-Time Fourier Transform (see [@Mueller21_FMP_SPRINGER] for fundamentals of music processing), and an estimate of the F0 along with a confidence value. There are currently two F0 algorithms available: YIN [@CheveigneK02_YIN_JASA] from the `libf0` library [@RosenzweigSM22_libf0_ISMIR-LBD], and a real-time version of SWIPE [@CamachoH08_SawtoothWaveform_JASA] from the `rtswipe` library [@MeierSSMB25_RealTimeSWIPE_CMMR]. YIN is a time-domain algorithm that computes the F0 based on a tweaked auto-correlation function. It is computationally efficient and well-suited for low-latency applications, but it tends to suffer from estimation errors, particularly confusions with higher harmonics such as the octave. SWIPE is a frequency-domain algorithm that computes the F0 by correlating pitch candidate kernels derived from a sawtooth waveform with the spectrum of an input signal. It is known to be more robust to noise than YIN.
The obtained F0 estimates, which are natively computed in the unit Hz, are converted to the unit cents using a user-specified reference frequency. Depending on the audio quality and vocal characteristics, F0 estimates may exhibit artifacts such as discontinuities or pitch slides, which can make the resulting trajectories difficult to interpret [@RosenzweigSM19_StableF0_ISMIR]. Previous research has shown that using throat microphones can improve the isolation of individual voices in group singing contexts, resulting in cleaner signals and more accurate F0 estimates [@Scherbaum16_LarynxMicrophones_IWFMA]. To further enhance interpretability, `pytch` includes several optional post-processing steps: a confidence threshold to discard estimates with a low confidence score, a median filter to smooth the trajectories, and a gradient filter to suppress abrupt pitch slides. As a final step in the audio analysis, the harmonic intervals between the F0 trajectories are computed. Every audio feature is stored separately in a dedicated ring buffer. After processing, the pipeline sets a flag that notifies the GUI that new data is ready for visualization.

## Graphical User Interface (GUI)
-In this section, we provide a step-by-step explanation of the `pytch` GUI implemented in the file `gui.py`. Right after the program start, a startup menu opens in which the user is asked to specify the soundcard, input channels, sampling rate, and window size for processing (see Figure \autoref{fig:menu}). Furthermore, the user can choose to store the recorded audio and the F0 trajectories on disk.
+In this section, we provide a step-by-step explanation of the `pytch` GUI implemented in the file `gui.py`. Inspired by other scientific Python-based tools [@Aubier25_FUS_JOSS;@PatakyND19_mwarp1d_JOSS], we designed the `pytch` GUI using the open-source library `pyqtgraph`[^10].
+ +[^8]: +[^9]: +[^10]: + +Right after the program start, a startup menu opens in which the user is asked to specify the soundcard, input channels, sampling rate, and window size for processing (see Figure \autoref{fig:menu}). Furthermore, the user can choose to store the recorded audio and the F0 trajectories on disk. ![`pytch` startup menu.\label{fig:menu}](../pictures/menu.png){ width=50% } @@ -99,6 +103,6 @@ The right section, referred to as the "trajectory view," provides time-based vis The authors used the AI tool ChatGPT to assist with grammar, spelling, and language refinement in this article. In addition, ChatGPT was used to generate example code for PyQt6 functions and classes. No code was copied verbatim from ChatGPT; all generated examples were reviewed, adapted, and integrated by the authors. # Acknowledgements -We would like to thank Lukas Dietz for his help with the implementation, Peter Meier and Sebastian Strahl for the collaboration on the real-time SWIPE implementation, and all the singers who contributed to testing `pytch` during its development. +We would like to thank Lukas Dietz for his help with the implementation, Peter Meier, Sebastian Strahl, and Meinard Müller for the collaboration on the real-time SWIPE implementation, and all the singers who contributed to testing `pytch` during its development. # References From 43c16df2c1217d8b0bd7951f8f27655afde199a5 Mon Sep 17 00:00:00 2001 From: sebastianrosenzweig Date: Sun, 1 Feb 2026 18:37:29 +0100 Subject: [PATCH 7/7] add simon --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 8a5b52e..5e488ca 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -103,6 +103,6 @@ The right section, referred to as the "trajectory view," provides time-based vis The authors used the AI tool ChatGPT to assist with grammar, spelling, and language refinement in this article. 
In addition, ChatGPT was used to generate example code for PyQt6 functions and classes. No code was copied verbatim from ChatGPT; all generated examples were reviewed, adapted, and integrated by the authors. # Acknowledgements -We would like to thank Lukas Dietz for his help with the implementation, Peter Meier, Sebastian Strahl, and Meinard Müller for the collaboration on the real-time SWIPE implementation, and all the singers who contributed to testing `pytch` during its development. +We would like to thank Lukas Dietz for his help with the implementation, Peter Meier, Sebastian Strahl, Simon Schwär, and Meinard Müller for the collaboration on the real-time SWIPE implementation, and all the singers who contributed to testing `pytch` during its development. # References