forked from SB-BISS/ELISA_CODE
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathEmotionExtractor.py
More file actions
135 lines (106 loc) · 5.51 KB
/
EmotionExtractor.py
File metadata and controls
135 lines (106 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import Structures
import numpy
import pickle
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.layers import Dropout
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import SGD
from sklearn.pipeline import Pipeline
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import os
import re
import matplotlib.pyplot as plt
from sklearn.externals import joblib
import array
from pydub import AudioSegment
from pydub.utils import get_array_type
import pandas as pd
from matplotlib import pyplot as plt
class EmotionExtractor:
    """Predict emotion scores for speech audio with a pretrained
    CNN-attention network (architecture built by the Structures module).

    The network consumes sequences of 3 consecutive audio windows, each
    summarised by the mean of 34 pyAudioAnalysis short-term features, and
    outputs 7 emotion probabilities per sequence.
    """

    def __init__(self, filename_baseline, filename_mean_sd):
        """Build the network, load its weights and the normalisation stats.

        filename_baseline -- path to the saved Keras weights file.
        filename_mean_sd  -- path to a pickle holding a dict with "mean"
                             and "sd" arrays used to z-score features.
        """
        # Structures(3, 34, 7, 256): presumably (timesteps, features,
        # classes, hidden units) -- TODO confirm against Structures module.
        self.structure = Structures.Structures(3, 34, 7, 256)
        self.my_attention_network = self.structure.structure_11_cnn_attention_dot()
        # Load pretrained weights into the freshly built network.
        self.my_attention_network.load_weights(filename_baseline)
        # NOTE(review): pickle is unsafe on untrusted files -- only load
        # trusted model artefacts.  A context manager is used so the file
        # handle is always closed (the original leaked it).
        with open(filename_mean_sd, "rb") as handle:
            self.dictionary = pickle.load(handle)
        self.mean_train = self.dictionary.get("mean")
        self.sd_train = self.dictionary.get("sd")

    def extract_features(self, file_path):
        """Return the mean short-term features of an audio file as a
        (1, n_features) array."""
        [Fs, x] = audioBasicIO.readAudioFile(file_path)
        x = audioBasicIO.stereo2mono(x)  # pyAudioAnalysis requires mono
        features = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.05 * Fs, 0.025 * Fs)
        # Average each feature over all short-term frames.
        features = np.mean(features, axis=1)
        return np.asarray(features).reshape(len(features), -1).transpose()

    def extract_features2(self, Fs, x):
        """Like extract_features, but for an in-memory signal `x` sampled
        at `Fs` Hz.  Returns a (1, n_features) array; falls back to zeros
        when no frames can be extracted (e.g. too-short input)."""
        x = audioBasicIO.stereo2mono(x)  # pyAudioAnalysis requires mono
        features = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.05 * Fs, 0.025 * Fs)
        if len(features) == 0:
            # Guard: a zero (34 features x 2 frames) matrix keeps the
            # np.mean/reshape pipeline below well-defined.
            features = np.zeros((34, 2))
        features = np.mean(features, axis=1)
        return np.asarray(features).reshape(len(features), -1).transpose()

    def _segment_features(self, segment):
        """Extract z-scored features for one pydub AudioSegment chunk."""
        # pydub's sample_width is in BYTES, while get_array_type expects a
        # bit depth, hence * 8.  (split_song previously used * 16, which
        # decoded the raw samples with the wrong array element width.)
        bit_depth = segment.sample_width * 8
        array_type = get_array_type(bit_depth)
        samples = array.array(array_type, segment._data).tolist()
        features = self.extract_features2(
            segment.frame_rate, np.asarray(samples))[0]
        # Normalise with the training-set statistics.
        return (features - self.mean_train) / self.sd_train

    def _predict_emotions(self, convers):
        """Run the network on a sequence of 3 feature windows and return a
        label -> score dict (index order fixed by the trained model)."""
        prediction = self.my_attention_network.predict(np.array([convers]))[0]
        return {"Anger": prediction[0], "Disgust": prediction[1],
                "Fear": prediction[3], "Happiness": prediction[5],
                "Neutral": prediction[6], "Sadness": prediction[2],
                "Surprise": prediction[4]}

    def split_song(self, song):
        """Slide 3-second windows over `song` (a pydub AudioSegment) and
        predict emotions for every run of 3 consecutive windows.

        Returns a pandas DataFrame with one row of emotion scores per
        prediction (empty if the song is shorter than 9 seconds)."""
        rows = []
        convers = []
        for i in range(3000, len(song), 3000):
            convers.append(self._segment_features(song[i - 3000:i]))
            if len(convers) == 3:
                rows.append(self._predict_emotions(convers))
                convers.pop(0)  # slide the 3-window sequence by one chunk
        return pd.DataFrame.from_dict(rows)

    def split_single_song(self, song):
        """Split `song` into 3 equal chunks and predict a single emotion
        distribution for the whole clip.

        Returns a pandas DataFrame with one row of emotion scores."""
        rows = []
        convers = []
        # One third of the song per window, so any clip yields 3 windows.
        increment = int(float(len(song)) / 3)
        for i in range(increment, len(song) + increment, increment):
            convers.append(self._segment_features(song[i - increment:i]))
            if len(convers) == 3:
                # No pop here: exactly one prediction for the whole clip.
                rows.append(self._predict_emotions(convers))
        return pd.DataFrame.from_dict(rows)