forked from BUTSpeechFIT/VBx
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeatures.py
More file actions
149 lines (125 loc) · 6.51 KB
/
features.py
File metadata and controls
149 lines (125 loc) · 6.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
# Copyright Brno University of Technology (burget@fit.vutbr.cz)
# Licensed under the Apache License, Version 2.0 (the "License")
import numpy as np
def framing(a, window, shift=1):
    """Slice an array into (possibly overlapping) frames along its first axis.

    a      - input array; frames are taken along axis 0
    window - frame length (number of axis-0 elements in each frame)
    shift  - step between the starts of two consecutive frames

    Returns a strided view of shape (nframes, window) + a.shape[1:], where
    nframes = (a.shape[0] - window) // shift + 1. The view shares memory with
    `a` (overlapping frames alias the same elements), so take a copy before
    modifying it in place. When `a` is too short to hold a single frame, an
    array with zero frames is returned.
    """
    # For a.shape[0] < window the frame-count formula can become negative,
    # which as_strided rejects; clamp it so short inputs give zero frames.
    nframes = max((a.shape[0] - window) // shift + 1, 0)
    shape = (nframes, window) + a.shape[1:]
    strides = (a.strides[0] * shift, a.strides[0]) + a.strides[1:]
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
# Mel and inverse Mel scale warping functions
def mel_inv(x):
    """Inverse Mel warping: map Mel-scale value(s) x back to frequency."""
    ratio = np.exp(x / 1127.)
    return 700. * (ratio - 1.)
def mel(x):
    """Mel warping: map frequency value(s) x onto the Mel scale."""
    scaled = x / 700.
    return np.log(scaled + 1.) * 1127.
def preemphasis(x, coef=0.97):
    """Apply a first-order pre-emphasis filter along the last axis.

    x    - array of samples; the filter runs along the last axis, so x may be
           a single 1-D signal or a stack of frames (frames x samples)
    coef - pre-emphasis coefficient

    Returns y with y[..., n] = x[..., n] - coef * x[..., n-1]. The first sample
    of each row has no predecessor and uses itself instead, i.e.
    y[..., 0] = x[..., 0] * (1 - coef).
    """
    # Delayed copy of x: the first sample repeated once, then x shifted by one.
    # np.concatenate (rather than np.c_) keeps this valid for 1-D input too.
    delayed = np.concatenate((x[..., :1], x[..., :-1]), axis=-1)
    return x - delayed * coef
def mel_fbank_mx(winlen_nfft, fs, NUMCHANS=20, LOFREQ=0.0, HIFREQ=None, warp_fn=mel, inv_warp_fn=mel_inv, htk_bug=True):
    """Returns mel filterbank as an array (NFFT//2+1 x NUMCHANS)
    winlen_nfft - Typically the window length as used in the fbank_htk() call.
                  It is used to determine number of samples for FFT computation
                  (NFFT). If positive, the value (window length) is rounded up
                  to the next higher power of two to obtain HTK-compatible NFFT.
                  If negative, NFFT is set to -winlen_nfft. In such case, the
                  parameter nfft in the fbank_htk() call should be set likewise.
    fs          - sampling frequency (Hz, i.e. 1e7/SOURCERATE)
    NUMCHANS    - number of filter bank bands
    LOFREQ      - frequency (Hz) where the first filter starts
    HIFREQ      - frequency (Hz) where the last filter ends (default fs/2)
    warp_fn     - function for frequency warping and its inverse
    inv_warp_fn - inverse function to warp_fn
    htk_bug     - if True and LOFREQ > 0, the filterbank row at the first
                  band-edge bin is zeroed when LOFREQ/fs*NFFT + 0.5 exceeds
                  that bin index (kept only for HTK compatibility)
    """
    if not HIFREQ:
        HIFREQ = 0.5 * fs
    nfft = 2**int(np.ceil(np.log2(winlen_nfft))) if winlen_nfft > 0 else -int(winlen_nfft)

    # Warped frequency of every FFT bin. Integer division keeps the number of
    # rows equal to the rfft output length (NFFT//2 + 1) even for odd NFFT.
    fbin_mel = warp_fn(np.arange(nfft // 2 + 1, dtype=float) * fs / nfft)
    # NUMCHANS + 2 band edges, equally spaced on the warped scale.
    cbin_mel = np.linspace(warp_fn(LOFREQ), warp_fn(HIFREQ), NUMCHANS + 2)
    # Index of the first FFT bin above each band edge.
    cind = np.floor(inv_warp_fn(cbin_mel) / fs * nfft).astype(int) + 1

    mfb = np.zeros((len(fbin_mel), NUMCHANS))
    for i in range(NUMCHANS):
        # Rising slope between edge i and edge i+1 ...
        mfb[cind[i]:cind[i+1], i] = (cbin_mel[i] - fbin_mel[cind[i]:cind[i+1]]) / (cbin_mel[i] - cbin_mel[i+1])
        # ... and falling slope between edge i+1 and edge i+2.
        mfb[cind[i+1]:cind[i+2], i] = (cbin_mel[i+2] - fbin_mel[cind[i+1]:cind[i+2]]) / (cbin_mel[i+2] - cbin_mel[i+1])
    if LOFREQ > 0.0 and float(LOFREQ)/fs*nfft+0.5 > cind[0] and htk_bug:
        mfb[cind[0], :] = 0.0  # Just to be HTK compatible
    return mfb
def fbank_htk(x, window, noverlap, fbank_mx, nfft=None, _E=None,
              USEPOWER=False, RAWENERGY=True, PREEMCOEF=0.97, ZMEANSOURCE=False,
              ENORMALISE=True, ESCALE=0.1, SILFLOOR=50.0, USEHAMMING=True):
    """Log Mel-filter bank channel outputs
    Returns M-by-NUMCHANS matrix of log Mel-filter bank outputs extracted from
    signal x (one row per frame), where M is the number of extracted frames,
    which can be computed as floor((length(x)-noverlap)/(window-noverlap)).
    If _E is set, one extra energy column is added. Remaining parameters
    have the following meaning:
    x        - input signal
    window   - frame window length (in samples, i.e. WINDOWSIZE/SOURCERATE)
               or vector of window weights override default windowing function
               (see option USEHAMMING)
    noverlap - overlapping between frames (in samples, i.e window-TARGETRATE/SOURCERATE)
    fbank_mx - array with (Mel) filter bank (as returned by function mel_fbank_mx()).
               Note that this must be compatible with the parameter 'nfft'.
    nfft     - number of samples for FFT computation. By default, it is set in the
               HTK-compatible way to the window length rounded up to the next higher
               power of two.
    _E       - include energy as the "first" or the "last" coefficient of each
               feature vector. The possible values are: "first", "last", None.
    USEPOWER - exponent applied to the magnitude spectrum before the filter
               bank; False means 1 (magnitude) and True means 2 (power).
    Remaining options have exactly the same meaning as in HTK.
    See also:
      mel_fbank_mx:
        to obtain the matrix for the parameter fbank_mx
      add_dither:
        for adding dithering in HTK-like fashion
    """
    # Booleans are turned into the spectrum exponent: False -> 1, True -> 2.
    if isinstance(USEPOWER, bool):
        USEPOWER += 1
    if np.isscalar(window):
        window = np.hamming(window) if USEHAMMING else np.ones(window)
    else:
        # Explicit window weights; accept any array-like.
        window = np.asarray(window)
    if nfft is None:
        nfft = 2**int(np.ceil(np.log2(window.size)))

    # Copy the strided view so that the in-place operations below do not
    # write into memory shared by overlapping frames.
    x = framing(x.astype("float"), window.size, window.size - noverlap).copy()
    if ZMEANSOURCE:
        x -= x.mean(axis=1)[:, np.newaxis]
    if _E is not None and RAWENERGY:
        # Raw energy: measured before pre-emphasis and windowing.
        energy = np.log((x**2).sum(axis=1))
    if PREEMCOEF is not None:
        x = preemphasis(x, PREEMCOEF)
    x *= window
    if _E is not None and not RAWENERGY:
        energy = np.log((x**2).sum(axis=1))

    # Power spectrum, then reduce to |X|**USEPOWER when USEPOWER is not 2.
    x = np.fft.rfft(x, nfft)
    x = x.real**2 + x.imag**2
    if USEPOWER != 2:
        x **= 0.5 * USEPOWER
    # Filter bank outputs are floored at 1.0 before the log.
    x = np.log(np.maximum(1.0, np.dot(x, fbank_mx)))

    # Skip normalisation when there are no frames (max() of an empty array
    # would raise).
    if _E is not None and ENORMALISE and energy.size:
        energy = (energy - energy.max()) * ESCALE + 1.0
        min_val = -np.log(10**(SILFLOOR/10.)) * ESCALE + 1.0
        energy[energy < min_val] = min_val

    return np.hstack(([energy[:, np.newaxis]] if _E == "first" else []) + [x] +
                     ([energy[:, np.newaxis]] if (_E in ["last", True]) else []))
def povey_window(winlen):
    """Return a window of winlen points: (0.5 - 0.5*cos(t)) raised to the power
    0.85, with t spaced linearly from 0 to 2*pi (both ends included)."""
    phase = np.linspace(0, 2*np.pi, winlen)
    raised_cosine = 0.5 - 0.5 * np.cos(phase)
    return raised_cosine ** 0.85
def add_dither(x, level=8):
    """Return x plus uniform random noise drawn from [-level, level).

    The noise has the same shape as x and comes from np.random.rand, so it
    follows NumPy's global random state.
    """
    noise = 2 * np.random.rand(*x.shape) - 1
    return x + level * noise
def cmvn_floating_kaldi(x, LC, RC, norm_vars=True):
    """Floating-window mean (and optionally variance) normalization.
    x is the feature matrix (nframes x dim)
    LC, RC give how many frames to the left and to the right of the current
    frame belong to its window. Frames near the start or the end are handled
    in a Kaldi-like way: the window keeps its full size and is shifted so that
    it still fits inside the segment, instead of staying centered on the frame.
    When nframes is smaller than LC+RC+1 the window covers the whole segment,
    i.e. normalization is global.
    If norm_vars is True, the mean-normalized features are also divided by the
    square root of the windowed mean of their squares.
    """
    N, dim = x.shape
    win_len = min(len(x), LC + RC + 1)

    # First and one-past-last frame of the window belonging to each frame.
    first = np.clip(np.arange(-LC, N - LC), 0, N - win_len)
    last = first + win_len
    zero_row = np.zeros((1, dim))

    # A cumulative sum with a leading zero row turns every window sum into a
    # difference of two rows.
    csum = np.concatenate((zero_row, np.cumsum(x, 0)))
    x = x - (csum[last] - csum[first]) / win_len
    if not norm_vars:
        return x

    csum_sq = np.concatenate((zero_row, np.cumsum(x**2, 0)))
    x /= np.sqrt((csum_sq[last] - csum_sq[first]) / win_len)
    return x