Binary file added __pycache__/audio_processing.cpython-311.pyc
Binary file added __pycache__/audio_processing.cpython-36.pyc
Binary file added __pycache__/data_utils.cpython-311.pyc
Binary file added __pycache__/data_utils.cpython-36.pyc
Binary file added __pycache__/distributed.cpython-311.pyc
Binary file added __pycache__/distributed.cpython-36.pyc
Binary file added __pycache__/hparams.cpython-311.pyc
Binary file added __pycache__/hparams.cpython-36.pyc
Binary file added __pycache__/layers.cpython-311.pyc
Binary file added __pycache__/layers.cpython-36.pyc
Binary file added __pycache__/logger.cpython-311.pyc
Binary file added __pycache__/logger.cpython-36.pyc
Binary file added __pycache__/loss_function.cpython-311.pyc
Binary file added __pycache__/loss_function.cpython-36.pyc
Binary file added __pycache__/model.cpython-311.pyc
Binary file added __pycache__/model.cpython-36.pyc
Binary file added __pycache__/plotting_utils.cpython-311.pyc
Binary file added __pycache__/plotting_utils.cpython-36.pyc
Binary file added __pycache__/stft.cpython-311.pyc
Binary file added __pycache__/stft.cpython-36.pyc
Binary file added __pycache__/utils.cpython-311.pyc
Binary file added __pycache__/utils.cpython-36.pyc
9 changes: 6 additions & 3 deletions data_utils.py
@@ -7,14 +7,16 @@
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence

print("Trainloader script")

class TextMelLoader(torch.utils.data.Dataset):
"""
1) loads audio,text pairs
2) normalizes text and converts them to sequences of one-hot vectors
3) computes mel-spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams):
def __init__(self, audiopaths_and_text, hparams, dataset_type):
self.dataset_type = dataset_type
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
@@ -36,9 +38,10 @@ def get_mel_text_pair(self, audiopath_and_text):

def get_mel(self, filename):
if not self.load_mel_from_disk:
audio, sampling_rate = load_wav_to_torch(filename)
audio, sampling_rate = load_wav_to_torch(self.dataset_type ,filename, self.stft.sampling_rate)

if sampling_rate != self.stft.sampling_rate:
raise ValueError("{} {} SR doesn't match target {} SR".format(
raise ValueError("{} SR doesn't match target {} SR".format(
sampling_rate, self.stft.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
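The call above now hands self.dataset_type and the target sampling rate to load_wav_to_torch, and the TextMelLoader constructor gains a dataset_type argument, so callers (for example the data-loader setup in train.py, not shown in this diff) would presumably pass a label such as 'train' or 'validation'. The matching change to utils.py is not part of this diff; the sketch below is only a guess at the implied signature, assuming the extra arguments are used to resample mismatched audio to the target rate, and the dataset_type handling is hypothetical.

# Hypothetical reconstruction of the load_wav_to_torch() signature implied by the
# call site above; the real utils.py change is not included in this excerpt.
import librosa
import numpy as np
import torch
from scipy.io.wavfile import read


def load_wav_to_torch(dataset_type, full_path, target_sampling_rate):
    """Load a wav file and resample it to the target rate when needed.

    How dataset_type influences loading is not visible in the diff; here it is
    only carried along for the caller's bookkeeping.
    """
    sampling_rate, data = read(full_path)
    data = data.astype(np.float32)  # keep the int16 value range expected by max_wav_value

    # Assumed behaviour: bring mismatched recordings to the target rate so the
    # sampling-rate check in TextMelLoader.get_mel() passes.
    if sampling_rate != target_sampling_rate:
        data = librosa.resample(data, orig_sr=sampling_rate, target_sr=target_sampling_rate)
        sampling_rate = target_sampling_rate

    return torch.FloatTensor(data), sampling_rate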
145 changes: 90 additions & 55 deletions hparams.py
@@ -1,95 +1,130 @@
import tensorflow as tf
# import tensorflow as tf
import logging
from text import symbols

# Set up logging
logging.basicConfig(level=logging.INFO)

print("Hyper Params script")

# Custom HParams class to allow dot notation access
class HParams(dict):
"""Custom class that allows dot notation for dictionary keys."""

def __getattr__(self, name):
"""Override attribute access to allow dot notation."""
if name in self:
return self[name]
else:
raise AttributeError(f"'HParams' object has no attribute '{name}'")

def __setattr__(self, name, value):
"""Override setting attributes."""
self[name] = value

def __delattr__(self, name):
"""Override deleting attributes."""
del self[name]

# Create Hyperparameters using the custom HParams class
def create_hparams(hparams_string=None, verbose=False):
"""Create model hyperparameters. Parse nondefault from given string."""

hparams = tf.contrib.training.HParams(
# Initialize hyperparameters using the custom class
hparams = HParams({
################################
# Experiment Parameters #
################################
epochs=500,
iters_per_checkpoint=1000,
seed=1234,
dynamic_loss_scaling=True,
fp16_run=False,
distributed_run=False,
dist_backend="nccl",
dist_url="tcp://localhost:54321",
cudnn_enabled=True,
cudnn_benchmark=False,
ignore_layers=['embedding.weight'],
'epochs': 1600,
'iters_per_checkpoint': 1000,
'seed': 1234,
'dynamic_loss_scaling': True,
'fp16_run': False,
'distributed_run': False,
'dist_backend': "nccl",
'dist_url': "tcp://localhost:54321",
'cudnn_enabled': True,
'cudnn_benchmark': False,
'ignore_layers': ['embedding.weight'],

################################
# Data Parameters #
################################
load_mel_from_disk=False,
training_files='filelists/ljs_audio_text_train_filelist.txt',
validation_files='filelists/ljs_audio_text_val_filelist.txt',
text_cleaners=['english_cleaners'],
'load_mel_from_disk': False,
'training_files': './datasets/train_datasets/line_index.tsv',
'validation_files': './datasets/validation_datasets/line_index.tsv',
'text_cleaners': ['transliteration_cleaners'],

################################
# Audio Parameters #
################################
max_wav_value=32768.0,
sampling_rate=22050,
filter_length=1024,
hop_length=256,
win_length=1024,
n_mel_channels=80,
mel_fmin=0.0,
mel_fmax=8000.0,
'max_wav_value': 32768.0,
'sampling_rate': 22050,
'filter_length': 1024,
'hop_length': 256,
'win_length': 1024,
'n_mel_channels': 80,
'mel_fmin': 0.0,
'mel_fmax': 8000.0,

################################
# Model Parameters #
################################
n_symbols=len(symbols),
symbols_embedding_dim=512,
'n_symbols': len(symbols),
'symbols_embedding_dim': 512,

# Encoder parameters
encoder_kernel_size=5,
encoder_n_convolutions=3,
encoder_embedding_dim=512,
'encoder_kernel_size': 5,
'encoder_n_convolutions': 3,
'encoder_embedding_dim': 512,

# Decoder parameters
n_frames_per_step=1, # currently only 1 is supported
decoder_rnn_dim=1024,
prenet_dim=256,
max_decoder_steps=1000,
gate_threshold=0.5,
p_attention_dropout=0.1,
p_decoder_dropout=0.1,
'n_frames_per_step': 1, # currently only 1 is supported
'decoder_rnn_dim': 1024,
'prenet_dim': 256,
'max_decoder_steps': 1000,
'gate_threshold': 0.5,
'p_attention_dropout': 0.1,
'p_decoder_dropout': 0.1,

# Attention parameters
attention_rnn_dim=1024,
attention_dim=128,
'attention_rnn_dim': 1024,
'attention_dim': 128,

# Location Layer parameters
attention_location_n_filters=32,
attention_location_kernel_size=31,
'attention_location_n_filters': 32,
'attention_location_kernel_size': 31,

# Mel-post processing network parameters
postnet_embedding_dim=512,
postnet_kernel_size=5,
postnet_n_convolutions=5,
'postnet_embedding_dim': 512,
'postnet_kernel_size': 5,
'postnet_n_convolutions': 5,

################################
# Optimization Hyperparameters #
################################
use_saved_learning_rate=False,
learning_rate=1e-3,
weight_decay=1e-6,
grad_clip_thresh=1.0,
batch_size=64,
mask_padding=True # set model's padded outputs to padded values
)
'use_saved_learning_rate': False,
'learning_rate': 1e-3,
'weight_decay': 1e-6,
'grad_clip_thresh': 1.0,
'batch_size': 32,
'mask_padding': True # set model's padded outputs to padded values
})

# If a hparams string is provided, parse it
if hparams_string:
tf.logging.info('Parsing command line hparams: %s', hparams_string)
hparams.parse(hparams_string)
logging.info('Parsing command line hparams: %s', hparams_string)
# Assuming hparams_string is in a format where key=value pairs are provided (like 'epochs=1000')
hparams_list = hparams_string.split(',')
for param in hparams_list:
key, value = param.split('=')
if key in hparams:
hparams[key] = type(hparams[key])(value) # Convert to the correct type
else:
logging.warning("Unknown parameter: %s", key)

# If verbose, log the final parsed hyperparameters
if verbose:
tf.logging.info('Final parsed hparams: %s', hparams.values())
logging.info('Final parsed hparams: %s', hparams)

return hparams
return hparams
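The HParams class above replaces tf.contrib.training.HParams with a plain dict subclass whose __getattr__, __setattr__ and __delattr__ forward to the dictionary. A short sketch of how that behaves in practice, using a couple of illustrative keys rather than the full set defined above:

# Minimal check of the dot-notation behaviour of the dict-based HParams class;
# the class itself is taken from the diff above, the keys here are illustrative.
from hparams import HParams

hp = HParams({'sampling_rate': 22050, 'hop_length': 256})

assert hp.sampling_rate == 22050   # __getattr__ falls back to the dict entry
hp.hop_length = 275                # __setattr__ writes into the dict
assert hp['hop_length'] == 275     # plain dict access still works
del hp.hop_length                  # __delattr__ removes the key

try:
    hp.missing_key
except AttributeError as err:
    print(err)                     # 'HParams' object has no attribute 'missing_key'

Because __getattr__ is only consulted after normal attribute lookup fails, ordinary dict methods such as keys() and items() keep working unchanged.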
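create_hparams() now parses overrides itself instead of calling hparams.parse(). A sketch of the expected round trip, assuming an override string in the comma-separated key=value form the loop above handles:

# Illustrative round trip through create_hparams() with a "key=value,key=value"
# override string, parsed by the split(',')/split('=') loop above.
from hparams import create_hparams

hparams = create_hparams()                    # defaults from the dictionary above
print(hparams.batch_size, hparams.epochs)     # 32 1600

hparams = create_hparams("epochs=500,learning_rate=0.0005", verbose=True)
assert hparams.epochs == 500                  # cast back via type(hparams[key])(value)
assert abs(hparams.learning_rate - 5e-4) < 1e-12

# Caveat: type(hparams[key])(value) only round-trips flat scalars. bool("False")
# is True, so flags such as fp16_run cannot be switched off this way, and
# list-valued keys such as ignore_layers or text_cleaners are not handled.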