-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain_gmm.py
More file actions
254 lines (200 loc) · 11.1 KB
/
train_gmm.py
File metadata and controls
254 lines (200 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import numpy as np
import librosa
import matplotlib.pyplot as plt
from audiodataloader import AudioDataLoader
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
def compute_mfcc_features(signal, sample_rate, n_mfcc=12, n_mels=22, frame_size=25.6e-3, hop_size=10e-3, n_fft=2048):
    """Extract static, delta, and delta-delta MFCCs from an audio signal.

    Parameters:
    - signal: 1-D audio samples.
    - sample_rate: sampling rate of `signal` in Hz.
    - n_mfcc: number of static cepstral coefficients per frame.
    - n_mels: number of Mel filterbank channels.
    - frame_size, hop_size: analysis window and hop durations in seconds.
    - n_fft: FFT size used for the spectrogram.

    Returns:
    - Array of shape (3 * n_mfcc, n_frames): static MFCCs stacked with their
      first- and second-order differences (36 dims with the defaults).
    """
    # Translate window/hop durations (seconds) into sample counts.
    win_samples = int(frame_size * sample_rate)
    hop_samples = int(hop_size * sample_rate)
    # Static cepstral coefficients.
    static = librosa.feature.mfcc(
        y=signal,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_samples,
        win_length=win_samples,
        n_mels=n_mels,
    )
    # First- and second-order temporal differences over a 3-frame window.
    delta = librosa.feature.delta(static, width=3)
    delta2 = librosa.feature.delta(static, order=2, width=3)
    # Stack along the feature axis: one 3*n_mfcc-dimensional vector per frame.
    return np.concatenate([static, delta, delta2], axis=0)
def train_ubm(mfcc_features, n_components=16, max_iter=100, reg_covar=1e-6):
    """Fit a diagonal-covariance GMM to serve as the Universal Background Model.

    Parameters:
    - mfcc_features: array of shape (n_frames, n_features) pooled over all data.
    - n_components: number of Gaussian components in the UBM.
    - max_iter: EM iteration cap.
    - reg_covar: regularization added to covariance diagonals to avoid
      singular components.

    Returns:
    - The fitted GaussianMixture model (UBM).
    """
    ubm = GaussianMixture(
        n_components=n_components,
        covariance_type='diag',
        max_iter=max_iter,
        random_state=42,
        reg_covar=reg_covar,
    ).fit(mfcc_features)
    # Report EM convergence so a non-converged UBM is noticed early.
    if ubm.converged_:
        print(f"GMM converged after {ubm.n_iter_} iterations.")
    else:
        print(f"GMM did not converge. Reached the maximum of {max_iter} iterations.")
    return ubm
def compute_posterior_probs(gmm, mfcc_features, relevance_factor=16):
    """Compute per-frame posterior probabilities (responsibilities) under the UBM.

    Parameters:
    - gmm: the trained GMM (UBM).
    - mfcc_features: array of shape (n_frames, n_features).
    - relevance_factor: unused here; kept for interface compatibility with the
      other MAP-adaptation helpers.

    Returns:
    - responsibilities: array of shape (n_frames, n_components) whose rows sum
      to 1 — the posterior probability of each component given each frame.
    """
    # BUGFIX: the private sklearn method _estimate_log_prob_resp returns
    # *log* responsibilities; the previous code passed those logs downstream
    # as if they were probabilities. predict_proba is the public API and
    # returns the exponentiated, row-normalized posteriors directly.
    return gmm.predict_proba(mfcc_features)
def update_means(ubm, responsibilities, mfcc_features, relevance_factor):
    """Update the means of the UBM components using MAP adaptation.

    Implements the mean update of Reynolds et al. (2000):
        m_k = (sum_t gamma_tk * x_t + r * m_k^ubm) / (N_k + r)
    which is the occupancy-weighted blend of the data mean E_k = S_k / N_k
    and the prior UBM mean.

    Parameters:
    - ubm: the Universal Background Model (GMM) providing prior means.
    - responsibilities: posterior probabilities, shape (n_frames, n_components).
    - mfcc_features: features, shape (n_frames, n_features).
    - relevance_factor: MAP relevance factor r (prior strength).

    Returns:
    - adapted_means: array of shape (n_components, n_features).
    """
    # Effective number of frames assigned to each component.
    N_k = np.sum(responsibilities, axis=0)  # Shape: (n_components,)
    # Responsibility-weighted sum of the data: S_k = sum_t gamma_tk * x_t.
    weighted_sum = np.dot(responsibilities.T, mfcc_features)  # (n_components, n_features)
    denom = N_k[:, np.newaxis] + relevance_factor
    # BUGFIX: the data term is S_k / (N_k + r). The previous code computed
    # (N_k / (N_k + r)) * S_k, i.e. it multiplied the data contribution by
    # an extra factor of N_k (it should have divided S_k by N_k to get the
    # data mean before applying the N_k/(N_k+r) weight).
    adapted_means = (weighted_sum + relevance_factor * ubm.means_) / denom
    return adapted_means
def update_covariances(ubm, responsibilities, mfcc_features, adapted_means, relevance_factor):
    """Update the diagonal covariances of the UBM components via MAP adaptation.

    Blends the responsibility-weighted squared deviations of the data from the
    *adapted* means with the prior UBM covariances, weighted by the relevance
    factor.

    Parameters:
    - ubm: the Universal Background Model (GMM) providing prior covariances.
    - responsibilities: posterior probabilities, shape (n_frames, n_components).
    - mfcc_features: features, shape (n_frames, n_features).
    - adapted_means: MAP-adapted means, shape (n_components, n_features).
    - relevance_factor: MAP relevance factor (prior strength).

    Returns:
    - adapted_covariances: array of shape (n_components, n_features).
    """
    # Effective number of frames assigned to each component.
    occupancy = responsibilities.sum(axis=0)  # (n_components,)
    # Deviation of every frame from every component's adapted mean.
    deviations = mfcc_features[:, np.newaxis, :] - adapted_means  # (n_frames, n_components, n_features)
    # Responsibility-weighted sum of squared deviations, summed over frames.
    sq_dev_sum = np.einsum('tk,tkf->kf', responsibilities, deviations ** 2)
    denom = occupancy[:, np.newaxis] + relevance_factor
    return (sq_dev_sum + relevance_factor * ubm.covariances_) / denom
def update_weights(ubm, responsibilities, relevance_factor):
    """Update the mixture weights of the UBM components via MAP adaptation.

    Parameters:
    - ubm: the Universal Background Model (GMM) providing prior weights.
    - responsibilities: posterior probabilities, shape (n_frames, n_components).
    - relevance_factor: MAP relevance factor (prior strength).

    Returns:
    - adapted_weights: array of shape (n_components,). Sums to 1 because the
      prior weights sum to 1 and the denominator is sum(N_k) + r.
    """
    # Effective number of frames assigned to each component.
    occupancy = responsibilities.sum(axis=0)  # (n_components,)
    normalizer = occupancy.sum() + relevance_factor
    return (occupancy + relevance_factor * ubm.weights_) / normalizer
def compute_precision_cholesky(adapted_covariances):
    """Compute the precision Cholesky factors for diagonal covariances.

    Parameters:
    - adapted_covariances: diagonal covariances of the adapted GMM.

    Returns:
    - precisions_cholesky: elementwise 1/sqrt(covariance). For a diagonal
      matrix the precision is the elementwise reciprocal, and its Cholesky
      factor is simply the elementwise square root of that reciprocal.
    """
    return np.sqrt(1.0 / adapted_covariances)
def adapt_ubm_map(ubm, mfcc_features, relevance_factor=16):
    """MAP-adapt the UBM to a new set of MFCC features.

    Parameters:
    - ubm: the Universal Background Model (GMM).
    - mfcc_features: array of shape (n_frames, n_features).
    - relevance_factor: MAP relevance factor (typically 10-20).

    Returns:
    - adapted_gmm: a new GaussianMixture carrying the adapted parameters.
    """
    # Posterior responsibility of each UBM component for each frame.
    resp = compute_posterior_probs(ubm, mfcc_features)
    # MAP-adapt means, covariances, and weights toward the new data.
    means = update_means(ubm, resp, mfcc_features, relevance_factor)
    covariances = update_covariances(ubm, resp, mfcc_features, means, relevance_factor)
    weights = update_weights(ubm, resp, relevance_factor)
    # Assemble a fresh GMM and install the adapted parameters directly
    # (no fitting): sklearn also needs precisions_cholesky_ to score data.
    adapted_gmm = GaussianMixture(n_components=ubm.n_components, covariance_type='diag')
    adapted_gmm.means_ = means
    adapted_gmm.covariances_ = covariances
    adapted_gmm.weights_ = weights
    # For diagonal covariances the precision Cholesky is 1/sqrt(variance).
    adapted_gmm.precisions_cholesky_ = 1 / np.sqrt(covariances)
    return adapted_gmm
def extract_supervector(gmm):
    """Flatten an adapted GMM's parameters into supervectors.

    Parameters:
    - gmm: the adapted GaussianMixture model.

    Returns:
    - supervector: 1-D array of concatenated means, diagonal covariances,
      and mixture weights.
    - simplified_supervector: the mixture weights alone.
    """
    parts = [gmm.means_.flatten(), gmm.covariances_.flatten(), gmm.weights_]
    full_vector = np.concatenate(parts)
    # The simplified variant keeps only the component weights.
    return full_vector, gmm.weights_
if __name__ == "__main__":
loader = AudioDataLoader(config_file='config.json', word_data= True, phone_data= False, sentence_data= True)
words_segments = loader.create_dataclass_words()
mfcc_list = []
for word in words_segments:
signal = word.audio_data
sample_rate = word.sample_rate
# Compute 12 static MFCCs, 24 dynamic (delta and delta-delta) MFCCs, using 22 Mel filters
mfcc = compute_mfcc_features(signal, sample_rate)
#Transpose to get it like that: (n_components, n_features) for the covarianve_type: diag
mfcc_list.append(np.transpose(mfcc))
print(len(mfcc_list),np.shape(mfcc_list[0]))
# Concatenate all MFCC features into a single matrix So (n_frames,36 features)
mfcc_features = np.concatenate(mfcc_list, axis=0)
print(np.shape(mfcc_features))
print("Training UBM...")
ubm = train_ubm(mfcc_features, n_components=16, max_iter=100, reg_covar=1e-6)
print("Training finished!")
# Step 2: Adapt the UBM for each word
print("Adapting UBM for each word...")
supervectors = []
simmplified_supervectors = []
for word in words_segments:
signal = word.audio_data
mfcc = compute_mfcc_features(signal, word.sample_rate)
mfcc = np.transpose(mfcc) # Shape it to (n_frames, n_features)
# Adapt the UBM to this word
#print("mfcc word adaption shape:", np.shape(mfcc))
adapted_gmm = adapt_ubm_map(ubm, mfcc)
# Step 3: Extract the supervector
supervector, simmplified_supervector = extract_supervector(adapted_gmm)
#print(np.shape(supervector),np.shape(simmplified_supervector))
supervectors.append(supervector)
simmplified_supervectors.append(simmplified_supervector)
# supervectors now contains the supervectors for each word
print(f"Extracted {len(supervectors)} supervectors and {len(simmplified_supervectors)} simplified supervectors")