-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain_gmm.py
More file actions
254 lines (200 loc) · 11.1 KB
/
train_gmm.py
File metadata and controls
254 lines (200 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import numpy as np
import librosa
import matplotlib.pyplot as plt
from audiodataloader import AudioDataLoader
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
def compute_mfcc_features(signal, sample_rate, n_mfcc=12, n_mels=22, frame_size=25.6e-3, hop_size=10e-3, n_fft=2048):
    """Extract static, delta, and delta-delta MFCCs from an audio signal.

    Parameters:
    - signal: 1-D audio samples.
    - sample_rate: sampling rate of `signal` in Hz.
    - n_mfcc: number of static cepstral coefficients per frame.
    - n_mels: number of Mel filterbank channels.
    - frame_size, hop_size: analysis window and hop durations in seconds.
    - n_fft: FFT size used for the spectrogram.

    Returns:
    - Array of shape (3 * n_mfcc, n_frames): static MFCCs stacked with their
      first- and second-order differences (36 dims with the defaults).
    """
    # Translate window/hop durations (seconds) into sample counts.
    win_samples = int(frame_size * sample_rate)
    hop_samples = int(hop_size * sample_rate)
    # Static cepstral coefficients.
    static = librosa.feature.mfcc(
        y=signal,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_samples,
        win_length=win_samples,
        n_mels=n_mels,
    )
    # First- and second-order temporal differences over a 3-frame window.
    delta = librosa.feature.delta(static, width=3)
    delta2 = librosa.feature.delta(static, order=2, width=3)
    # Stack along the feature axis: one 3*n_mfcc-dimensional vector per frame.
    return np.concatenate([static, delta, delta2], axis=0)
def train_ubm(mfcc_features, n_components=16, max_iter=100, reg_covar=1e-6):
    """Fit a diagonal-covariance GMM to serve as the Universal Background Model.

    Parameters:
    - mfcc_features: array of shape (n_frames, n_features) pooled over all data.
    - n_components: number of Gaussian components in the UBM.
    - max_iter: EM iteration cap.
    - reg_covar: regularization added to covariance diagonals to avoid
      singular components.

    Returns:
    - The fitted GaussianMixture model (UBM).
    """
    ubm = GaussianMixture(
        n_components=n_components,
        covariance_type='diag',
        max_iter=max_iter,
        random_state=42,
        reg_covar=reg_covar,
    ).fit(mfcc_features)
    # Report EM convergence so a non-converged UBM is noticed early.
    if ubm.converged_:
        print(f"GMM converged after {ubm.n_iter_} iterations.")
    else:
        print(f"GMM did not converge. Reached the maximum of {max_iter} iterations.")
    return ubm
def compute_posterior_probs(gmm, mfcc_features, relevance_factor=16):
    """Compute per-frame posterior probabilities (responsibilities) under the UBM.

    Parameters:
    - gmm: the trained GMM (UBM).
    - mfcc_features: array of shape (n_frames, n_features).
    - relevance_factor: unused here; kept for interface compatibility with the
      other MAP-adaptation helpers.

    Returns:
    - responsibilities: array of shape (n_frames, n_components) whose rows sum
      to 1 — the posterior probability of each component given each frame.
    """
    # BUGFIX: the private sklearn method _estimate_log_prob_resp returns
    # *log* responsibilities; the previous code passed those logs downstream
    # as if they were probabilities. predict_proba is the public API and
    # returns the exponentiated, row-normalized posteriors directly.
    return gmm.predict_proba(mfcc_features)
def update_means(ubm, responsibilities, mfcc_features, relevance_factor):
    """Update the means of the UBM components using MAP adaptation.

    Implements the mean update of Reynolds et al. (2000):
        m_k = (sum_t gamma_tk * x_t + r * m_k^ubm) / (N_k + r)
    which is the occupancy-weighted blend of the data mean E_k = S_k / N_k
    and the prior UBM mean.

    Parameters:
    - ubm: the Universal Background Model (GMM) providing prior means.
    - responsibilities: posterior probabilities, shape (n_frames, n_components).
    - mfcc_features: features, shape (n_frames, n_features).
    - relevance_factor: MAP relevance factor r (prior strength).

    Returns:
    - adapted_means: array of shape (n_components, n_features).
    """
    # Effective number of frames assigned to each component.
    N_k = np.sum(responsibilities, axis=0)  # Shape: (n_components,)
    # Responsibility-weighted sum of the data: S_k = sum_t gamma_tk * x_t.
    weighted_sum = np.dot(responsibilities.T, mfcc_features)  # (n_components, n_features)
    denom = N_k[:, np.newaxis] + relevance_factor
    # BUGFIX: the data term is S_k / (N_k + r). The previous code computed
    # (N_k / (N_k + r)) * S_k, i.e. it multiplied the data contribution by
    # an extra factor of N_k (it should have divided S_k by N_k to get the
    # data mean before applying the N_k/(N_k+r) weight).
    adapted_means = (weighted_sum + relevance_factor * ubm.means_) / denom
    return adapted_means
def update_covariances(ubm, responsibilities, mfcc_features, adapted_means, relevance_factor):
    """Update the diagonal covariances of the UBM components via MAP adaptation.

    Blends the responsibility-weighted squared deviations of the data from the
    *adapted* means with the prior UBM covariances, weighted by the relevance
    factor.

    Parameters:
    - ubm: the Universal Background Model (GMM) providing prior covariances.
    - responsibilities: posterior probabilities, shape (n_frames, n_components).
    - mfcc_features: features, shape (n_frames, n_features).
    - adapted_means: MAP-adapted means, shape (n_components, n_features).
    - relevance_factor: MAP relevance factor (prior strength).

    Returns:
    - adapted_covariances: array of shape (n_components, n_features).
    """
    # Effective number of frames assigned to each component.
    occupancy = responsibilities.sum(axis=0)  # (n_components,)
    # Deviation of every frame from every component's adapted mean.
    deviations = mfcc_features[:, np.newaxis, :] - adapted_means  # (n_frames, n_components, n_features)
    # Responsibility-weighted sum of squared deviations, summed over frames.
    sq_dev_sum = np.einsum('tk,tkf->kf', responsibilities, deviations ** 2)
    denom = occupancy[:, np.newaxis] + relevance_factor
    return (sq_dev_sum + relevance_factor * ubm.covariances_) / denom
def update_weights(ubm, responsibilities, relevance_factor):
    """Update the mixture weights of the UBM components via MAP adaptation.

    Parameters:
    - ubm: the Universal Background Model (GMM) providing prior weights.
    - responsibilities: posterior probabilities, shape (n_frames, n_components).
    - relevance_factor: MAP relevance factor (prior strength).

    Returns:
    - adapted_weights: array of shape (n_components,). Sums to 1 because the
      prior weights sum to 1 and the denominator is sum(N_k) + r.
    """
    # Effective number of frames assigned to each component.
    occupancy = responsibilities.sum(axis=0)  # (n_components,)
    normalizer = occupancy.sum() + relevance_factor
    return (occupancy + relevance_factor * ubm.weights_) / normalizer
def compute_precision_cholesky(adapted_covariances):
    """Compute the precision Cholesky factors for diagonal covariances.

    Parameters:
    - adapted_covariances: diagonal covariances of the adapted GMM.

    Returns:
    - precisions_cholesky: elementwise 1/sqrt(covariance). For a diagonal
      matrix the precision is the elementwise reciprocal, and its Cholesky
      factor is simply the elementwise square root of that reciprocal.
    """
    return np.sqrt(1.0 / adapted_covariances)
def adapt_ubm_map(ubm, mfcc_features, relevance_factor=16):
    """MAP-adapt the UBM to a new set of MFCC features.

    Parameters:
    - ubm: the Universal Background Model (GMM).
    - mfcc_features: array of shape (n_frames, n_features).
    - relevance_factor: MAP relevance factor (typically 10-20).

    Returns:
    - adapted_gmm: a new GaussianMixture carrying the adapted parameters.
    """
    # Posterior responsibility of each UBM component for each frame.
    resp = compute_posterior_probs(ubm, mfcc_features)
    # MAP-adapt means, covariances, and weights toward the new data.
    means = update_means(ubm, resp, mfcc_features, relevance_factor)
    covariances = update_covariances(ubm, resp, mfcc_features, means, relevance_factor)
    weights = update_weights(ubm, resp, relevance_factor)
    # Assemble a fresh GMM and install the adapted parameters directly
    # (no fitting): sklearn also needs precisions_cholesky_ to score data.
    adapted_gmm = GaussianMixture(n_components=ubm.n_components, covariance_type='diag')
    adapted_gmm.means_ = means
    adapted_gmm.covariances_ = covariances
    adapted_gmm.weights_ = weights
    # For diagonal covariances the precision Cholesky is 1/sqrt(variance).
    adapted_gmm.precisions_cholesky_ = 1 / np.sqrt(covariances)
    return adapted_gmm
def extract_supervector(gmm):
    """Flatten an adapted GMM's parameters into supervectors.

    Parameters:
    - gmm: the adapted GaussianMixture model.

    Returns:
    - supervector: 1-D array of concatenated means, diagonal covariances,
      and mixture weights.
    - simplified_supervector: the mixture weights alone.
    """
    parts = [gmm.means_.flatten(), gmm.covariances_.flatten(), gmm.weights_]
    full_vector = np.concatenate(parts)
    # The simplified variant keeps only the component weights.
    return full_vector, gmm.weights_
if __name__ == "__main__":
loader = AudioDataLoader(config_file='config.json', word_data= True, phone_data= False, sentence_data= True)
words_segments = loader.create_dataclass_words()
mfcc_list = []
for word in words_segments:
signal = word.audio_data
sample_rate = word.sample_rate
# Compute 12 static MFCCs, 24 dynamic (delta and delta-delta) MFCCs, using 22 Mel filters
mfcc = compute_mfcc_features(signal, sample_rate)
#Transpose to get it like that: (n_components, n_features) for the covarianve_type: diag
mfcc_list.append(np.transpose(mfcc))
print(len(mfcc_list),np.shape(mfcc_list[0]))
# Concatenate all MFCC features into a single matrix So (n_frames,36 features)
mfcc_features = np.concatenate(mfcc_list, axis=0)
print(np.shape(mfcc_features))
print("Training UBM...")
ubm = train_ubm(mfcc_features, n_components=16, max_iter=100, reg_covar=1e-6)
print("Training finished!")
# Step 2: Adapt the UBM for each word
print("Adapting UBM for each word...")
supervectors = []
simmplified_supervectors = []
for word in words_segments:
signal = word.audio_data
mfcc = compute_mfcc_features(signal, word.sample_rate)
mfcc = np.transpose(mfcc) # Shape it to (n_frames, n_features)
# Adapt the UBM to this word
#print("mfcc word adaption shape:", np.shape(mfcc))
adapted_gmm = adapt_ubm_map(ubm, mfcc)
# Step 3: Extract the supervector
supervector, simmplified_supervector = extract_supervector(adapted_gmm)
#print(np.shape(supervector),np.shape(simmplified_supervector))
supervectors.append(supervector)
simmplified_supervectors.append(simmplified_supervector)
# supervectors now contains the supervectors for each word
print(f"Extracted {len(supervectors)} supervectors and {len(simmplified_supervectors)} simplified supervectors")