From 746a1a37129d7fed22903839148ff103cc570ffe Mon Sep 17 00:00:00 2001 From: ivanlmh Date: Sat, 22 Feb 2025 21:30:36 +0000 Subject: [PATCH 1/4] [fix] key transposing calculation error --- modules/transformations.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/modules/transformations.py b/modules/transformations.py index 7a987b6..69c6826 100644 --- a/modules/transformations.py +++ b/modules/transformations.py @@ -232,10 +232,15 @@ def get_transpose_semitones(self, from_key, to_key): to_key = (to_key - 7) % 12 # Calculate the smallest semitone difference needed - difference = (to_key - from_key) % 12 - if difference > 6: - difference -= 12 - return difference + # Calculate direct difference first + direct_diff = to_key - from_key + + # Normalize to find shortest path + if direct_diff > 6: + direct_diff -= 12 + elif direct_diff < -6: + direct_diff += 12 + return direct_diff def analyze_tempo(self, beats_data): """Calculate tempo and time between beats""" From 8517733f55c3071dc4dd89e22629379e1ba0e9fa Mon Sep 17 00:00:00 2001 From: ivanlmh Date: Tue, 25 Feb 2025 10:53:33 +0000 Subject: [PATCH 2/4] [fix] segment length issue with augmentation + tempo ratio range in config --- config/grafp.yaml | 5 ++++- modules/transformations.py | 27 ++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/config/grafp.yaml b/config/grafp.yaml index 49ea854..40cd09f 100644 --- a/config/grafp.yaml +++ b/config/grafp.yaml @@ -56,6 +56,8 @@ T_max: 400 lambda: 0.0 error_threshold: 5 +weight_decay: 0 #1.0e-6 # 0 is the same as no weight decay + # stem: 'drums' # SampleID train hyperparameters mix_prob: 0.95 @@ -66,9 +68,10 @@ min_beats_required: 32 mix_prob: 0.95 mix_gain_range: [0.1, 0.7] #[0.05, 0.55] min_beats_required: 32 # minimum number of beats required in a sample to be included in the dataset +tempo_ratio_range: [0.5, 2.0] #[0.75, 1.5] # Augmentation hyperparameters -n_frames: 128 #10 #32 # depends on the 
spectrogram parameters (10 is for Music2latent), old nerualFP was 32, now 128 apparently +n_frames: 128 #10 #32 # depends on the spectrogram parameters (10 is for Music2latent), old nerualFP was 32, now 128 overlap: 0.875 #0.5 tr_snr: [0, 20] val_snr: [0, 10] diff --git a/modules/transformations.py b/modules/transformations.py index 69c6826..8173a99 100644 --- a/modules/transformations.py +++ b/modules/transformations.py @@ -198,6 +198,7 @@ def __init__(self, cfg, train=True, cpu=False): self.mix_prob = float(cfg.get("mix_prob", 0.95)) self.mix_gain_range = cfg.get("mix_gain_range", [0.05, 0.5]) # Narrower range self.mix_gain_range = [float(i) for i in self.mix_gain_range] + self.tempo_ratio_range = cfg.get("tempo_ratio_range", [0.5,2.0]) # Keep melspec transform self.logmelspec = nn.Sequential( @@ -266,9 +267,9 @@ def get_tempo_ratio(self, source_tempo, target_tempo): raw_ratio = target_tempo / source_tempo # Find the closest power of 2 multiple/divisor that keeps ratio between 0.5 and 2.0 - while raw_ratio > 2.0: + while raw_ratio > self.tempo_ratio_range[1]: #1.5: #2.0: raw_ratio /= 2.0 - while raw_ratio < 0.5: + while raw_ratio < self.tempo_ratio_range[0]: #0.75: #0.5: raw_ratio *= 2.0 return raw_ratio @@ -416,26 +417,26 @@ def process_audio_batch(self, batch_audio, metadata): # print("Offset", offset) # Apply offset and padding/trimming to same length - target_length = len(audio) + target_length = len(other_audio) if offset >= 0: # Add offset zeros at the start - other_audio = np.pad(other_audio, (offset, 0)) + audio = np.pad(audio, (offset, 0)) # Then trim/pad to target length - if len(other_audio) > target_length: - other_audio = other_audio[:target_length] + if len(audio) > target_length: + audio = audio[:target_length] else: - other_audio = np.pad( - other_audio, (0, target_length - len(other_audio)) + audio = np.pad( + audio, (0, target_length - len(audio)) ) elif offset < 0: - other_audio = other_audio[-offset:] - if len(other_audio) > target_length: + 
audio = audio[-offset:] + if len(audio) > target_length: # If longer than target, trim the end - other_audio = other_audio[:target_length] + audio = audio[:target_length] else: # If shorter than target, pad the end - other_audio = np.pad( - other_audio, (0, target_length - len(other_audio)) + audio = np.pad( + audio, (0, target_length - len(audio)) ) # Verify lengths match before mixing From e0525192dafdcaf7bf2c9cbbc05c7f5255391aad Mon Sep 17 00:00:00 2001 From: ivanlmh Date: Tue, 25 Feb 2025 10:54:36 +0000 Subject: [PATCH 3/4] [add] log config file, and use weight decay --- train.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index 0fb0c9e..1e6ccb2 100644 --- a/train.py +++ b/train.py @@ -197,6 +197,18 @@ def main(): cfg = load_config(args.config) writer = SummaryWriter(f'runs/{args.ckp}') + # log the configuration + print("Configuration parameters:") + for key, value in cfg.items(): + print(f" {key}: {value}") + + # Log all config parameters to TensorBoard + # Convert nested structures to strings for TensorBoard + config_flat = {} + for key, value in cfg.items(): + config_flat[key] = str(value) + writer.add_text("Configuration", str(config_flat), 0) + additive = args.additive if not additive: @@ -209,6 +221,7 @@ def main(): # Hyperparameters batch_size = cfg['bsz_train'] learning_rate = cfg['lr'] + weight_decay = cfg['weight_decay'] num_epochs = override(cfg['n_epochs'], args.epochs) model_name = args.ckp random_seed = args.seed @@ -309,7 +322,7 @@ def main(): print(count_parameters(model, args.encoder)) - optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = cfg['T_max'], eta_min = cfg['min_lr']) # scaler = GradScaler(enabled=True) scaler = DummyScaler() From d31312aadf85a805567af3397fa4b90e22280239 Mon Sep 17 00:00:00 2001 
From: ivanlmh Date: Thu, 27 Feb 2025 13:54:29 +0000 Subject: [PATCH 4/4] [fix] make data additive dataclass uniform with Adi's --- modules/data.py | 91 +++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/modules/data.py b/modules/data.py index f03e3b3..a8b7564 100644 --- a/modules/data.py +++ b/modules/data.py @@ -433,9 +433,11 @@ def __getitem__(self, idx): audio_resampled = resampler(audio_mono) clip_frames = int(self.sample_rate * self.dur) + offset_frames = int(self.sample_rate * self.offset) + + segment_length = clip_frames + offset_frames - if len(audio_resampled) <= clip_frames: - # self.ignore_idx.append(idx) + if len(audio_resampled) < segment_length: return self[idx + 1] key = self.get_key_for_file(datapath) @@ -449,63 +451,56 @@ def __getitem__(self, idx): "beats": beats, } - # For training pipeline, output a random frame of the audio - if self.train: - a_i = audio_resampled - a_j = a_i.clone() + a_i = audio_resampled - offset_mod = int(self.sample_rate * (self.offset) + clip_frames) - if len(audio_resampled) < offset_mod: - print( - "Audio too short (offset_mod > len(audio resampled)). Skipping..." 
- ) - return self[idx + 1] - r = np.random.randint(0, len(audio_resampled) - offset_mod) - ri = np.random.randint(0, offset_mod - clip_frames) - rj = np.random.randint(0, offset_mod - clip_frames) + start_idx = np.random.randint(0, len(audio_resampled) - segment_length + 1) + a_i = a_i[start_idx : start_idx + segment_length] - # Add timestamps to metadata - metadata.update( - {"start_i": r + ri, "start_j": r + rj, "clip_length": clip_frames} - ) + a_j = a_i.clone() - clip_i = a_i[r : r + offset_mod] - clip_j = a_j[r : r + offset_mod] - x_i = clip_i[ri : ri + clip_frames] - x_j = clip_j[rj : rj + clip_frames] + # Introduce offset by extracting a random dur-length segment + x_i_start = np.random.randint(0, offset_frames) + x_j_start = np.random.randint(0, offset_frames) - if x_i.abs().max() < self.silence or x_j.abs().max() < self.silence: - print("Silence detected. Skipping...") - return self[idx + 1] + x_i = a_i[x_i_start : x_i_start + clip_frames] + x_j = a_j[x_j_start : x_j_start + clip_frames] - if self.norm is not None: - norm_val = qtile_norm(audio_resampled, q=self.norm) - x_i = x_i / norm_val - x_j = x_j / norm_val + # Add timestamps to metadata + metadata.update( + { + "start_i": start_idx + x_i_start, + "start_j": start_idx + x_j_start, + "clip_length": clip_frames, + } + ) - if self.transform is not None: - x_i, x_j, transform_metadata = self.transform(x_i, x_j, metadata) + if x_i.abs().max() < self.silence or x_j.abs().max() < self.silence: + print("Silence detected. 
Skipping...") + return self[idx + 1] - if x_i is None or x_j is None: - return self[idx + 1] + # if self.norm is not None: + # norm_val = qtile_norm(audio_resampled, q=self.norm) + # x_i = x_i / norm_val + # x_j = x_j / norm_val - # Pad or truncate to sample_rate * dur - if len(x_i) < clip_frames: - x_i = F.pad(x_i, (0, clip_frames - len(x_i))) - else: - x_i = x_i[:clip_frames] + if self.transform is not None: + x_i, x_j, transform_metadata = self.transform(x_i, x_j, metadata) - if len(x_j) < clip_frames: - x_j = F.pad(x_j, (0, clip_frames - len(x_j))) - else: - x_j = x_j[:clip_frames] + if x_i is None or x_j is None: + return self[idx + 1] - return x_i, x_j, metadata + # Pad or truncate to sample_rate * dur + if len(x_i) < clip_frames: + x_i = F.pad(x_i, (0, clip_frames - len(x_i))) + else: + x_i = x_i[:clip_frames] - # For validation / test, output consecutive (overlapping) frames + if len(x_j) < clip_frames: + x_j = F.pad(x_j, (0, clip_frames - len(x_j))) else: - return audio_resampled, None, metadata - # return audio_resampled + x_j = x_j[:clip_frames] + + return x_i, x_j, metadata def __len__(self): - return len(self.filenames) \ No newline at end of file + return len(self.filenames)