forked from SB-BISS/ELISA_CODE
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathEmotionExtractor.py
More file actions
135 lines (106 loc) · 5.51 KB
/
EmotionExtractor.py
File metadata and controls
135 lines (106 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import Structures
import numpy
import pickle
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.layers import Dropout
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import SGD
from sklearn.pipeline import Pipeline
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import os
import re
import matplotlib.pyplot as plt
from sklearn.externals import joblib
import array
from pydub import AudioSegment
from pydub.utils import get_array_type
import pandas as pd
from matplotlib import pyplot as plt
class EmotionExtractor:
    """Predict emotion scores for speech audio with a pretrained
    CNN-attention network (architecture built by the Structures module).

    The network consumes sequences of 3 consecutive audio windows, each
    summarised by the mean of 34 pyAudioAnalysis short-term features, and
    outputs 7 emotion probabilities per sequence.
    """

    def __init__(self, filename_baseline, filename_mean_sd):
        """Build the network, load its weights and the normalisation stats.

        filename_baseline -- path to the saved Keras weights file.
        filename_mean_sd  -- path to a pickle holding a dict with "mean"
                             and "sd" arrays used to z-score features.
        """
        # Structures(3, 34, 7, 256): presumably (timesteps, features,
        # classes, hidden units) -- TODO confirm against Structures module.
        self.structure = Structures.Structures(3, 34, 7, 256)
        self.my_attention_network = self.structure.structure_11_cnn_attention_dot()
        # Load pretrained weights into the freshly built network.
        self.my_attention_network.load_weights(filename_baseline)
        # NOTE(review): pickle is unsafe on untrusted files -- only load
        # trusted model artefacts.  A context manager is used so the file
        # handle is always closed (the original leaked it).
        with open(filename_mean_sd, "rb") as handle:
            self.dictionary = pickle.load(handle)
        self.mean_train = self.dictionary.get("mean")
        self.sd_train = self.dictionary.get("sd")

    def extract_features(self, file_path):
        """Return the mean short-term features of an audio file as a
        (1, n_features) array."""
        [Fs, x] = audioBasicIO.readAudioFile(file_path)
        x = audioBasicIO.stereo2mono(x)  # pyAudioAnalysis requires mono
        features = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.05 * Fs, 0.025 * Fs)
        # Average each feature over all short-term frames.
        features = np.mean(features, axis=1)
        return np.asarray(features).reshape(len(features), -1).transpose()

    def extract_features2(self, Fs, x):
        """Like extract_features, but for an in-memory signal `x` sampled
        at `Fs` Hz.  Returns a (1, n_features) array; falls back to zeros
        when no frames can be extracted (e.g. too-short input)."""
        x = audioBasicIO.stereo2mono(x)  # pyAudioAnalysis requires mono
        features = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.05 * Fs, 0.025 * Fs)
        if len(features) == 0:
            # Guard: a zero (34 features x 2 frames) matrix keeps the
            # np.mean/reshape pipeline below well-defined.
            features = np.zeros((34, 2))
        features = np.mean(features, axis=1)
        return np.asarray(features).reshape(len(features), -1).transpose()

    def _segment_features(self, segment):
        """Extract z-scored features for one pydub AudioSegment chunk."""
        # pydub's sample_width is in BYTES, while get_array_type expects a
        # bit depth, hence * 8.  (split_song previously used * 16, which
        # decoded the raw samples with the wrong array element width.)
        bit_depth = segment.sample_width * 8
        array_type = get_array_type(bit_depth)
        samples = array.array(array_type, segment._data).tolist()
        features = self.extract_features2(
            segment.frame_rate, np.asarray(samples))[0]
        # Normalise with the training-set statistics.
        return (features - self.mean_train) / self.sd_train

    def _predict_emotions(self, convers):
        """Run the network on a sequence of 3 feature windows and return a
        label -> score dict (index order fixed by the trained model)."""
        prediction = self.my_attention_network.predict(np.array([convers]))[0]
        return {"Anger": prediction[0], "Disgust": prediction[1],
                "Fear": prediction[3], "Happiness": prediction[5],
                "Neutral": prediction[6], "Sadness": prediction[2],
                "Surprise": prediction[4]}

    def split_song(self, song):
        """Slide 3-second windows over `song` (a pydub AudioSegment) and
        predict emotions for every run of 3 consecutive windows.

        Returns a pandas DataFrame with one row of emotion scores per
        prediction (empty if the song is shorter than 9 seconds)."""
        rows = []
        convers = []
        for i in range(3000, len(song), 3000):
            convers.append(self._segment_features(song[i - 3000:i]))
            if len(convers) == 3:
                rows.append(self._predict_emotions(convers))
                convers.pop(0)  # slide the 3-window sequence by one chunk
        return pd.DataFrame.from_dict(rows)

    def split_single_song(self, song):
        """Split `song` into 3 equal chunks and predict a single emotion
        distribution for the whole clip.

        Returns a pandas DataFrame with one row of emotion scores."""
        rows = []
        convers = []
        # One third of the song per window, so any clip yields 3 windows.
        increment = int(float(len(song)) / 3)
        for i in range(increment, len(song) + increment, increment):
            convers.append(self._segment_features(song[i - increment:i]))
            if len(convers) == 3:
                # No pop here: exactly one prediction for the whole clip.
                rows.append(self._predict_emotions(convers))
        return pd.DataFrame.from_dict(rows)