-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathUCSB_malware_image_classification.py
More file actions
264 lines (210 loc) · 9.12 KB
/
UCSB_malware_image_classification.py
File metadata and controls
264 lines (210 loc) · 9.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env python
# coding: utf-8
# In[4]:
# import the necessary packages
from __future__ import print_function

import argparse
import os
import random
import sys

import cv2
import imutils # Make sure to install imutils using pip install imutils
import numpy as np
import sklearn
from imutils import paths
from scipy import io
from skimage import exposure
from sklearn import preprocessing
# NOTE(review): sklearn.datasets.base was removed in modern scikit-learn;
# `from sklearn.utils import Bunch` is the current location — confirm the
# pinned sklearn version before upgrading.
from sklearn.datasets.base import Bunch
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
def load_ucsb_malware_images(datasetPath, min_images=20, image_size=(128, 100), equal_samples=True,
                             test_size=0.33, seed=42, flatten=False):
    """Load the UCSB/Malimg malware-image dataset from a directory tree.

    Each image is read, converted to grayscale, and resized to
    ``image_size`` (an OpenCV ``(width, height)`` pair). The class label is
    the name of the directory that contains the image.

    Parameters
    ----------
    datasetPath : str
        Root directory with one sub-directory per malware class.
    min_images : int
        Minimum (and, when ``equal_samples`` is True, exact) number of
        images kept per class; classes with fewer images are dropped.
    image_size : tuple of int
        Target ``(width, height)`` passed to ``cv2.resize``.
    equal_samples : bool
        If True, randomly keep exactly ``min_images`` images per class.
    test_size : float
        Fraction of the samples reserved for the testing split.
    seed : int
        Seed for the module-level ``random`` generator.
    flatten : bool
        If True, each image is flattened to a 1-D feature vector.

    Returns
    -------
    tuple
        ``(training, testing, labels)`` — two ``Bunch`` objects carrying
        ``data``/``target`` arrays, plus the label array the splits were
        drawn from.
    """
    # grab the image paths associated with the malware images
    imagePaths = sorted(list(paths.list_images(datasetPath)))

    # set the random seed, then initialize the data matrix and labels
    random.seed(seed)
    data = []
    labels = []

    # loop over the image paths
    for imagePath in imagePaths:
        # load the image; skip files OpenCV cannot decode (imread returns
        # None), which would otherwise crash cvtColor
        image = cv2.imread(imagePath)
        if image is None:
            continue

        # convert to grayscale and resize to the fixed input size
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        face = cv2.resize(gray, image_size)

        # check to see if the image should be flattened into a single row
        if flatten:
            face = face.flatten()

        # the label is the name of the directory holding the image; using
        # os.path handles both "/" and "\\" separators (the original split
        # on "\\" and therefore only worked on Windows)
        data.append(face)
        labels.append(os.path.basename(os.path.dirname(imagePath)))

    # convert the data matrix and labels list to NumPy arrays
    data = np.array(data)
    labels = np.array(labels)

    # check to see if equal samples for each malware class should be used
    if equal_samples:
        # initialize the list of sampled indexes
        sampledIdxs = []

        # loop over the unique labels
        for label in np.unique(labels):
            # grab the indexes into the labels array where labels equals
            # the current label
            labelIdxs = np.where(labels == label)[0]

            # only proceed if the required number of minimum images per
            # malware class can be met
            if len(labelIdxs) >= min_images:
                # randomly sample exactly `min_images` indexes for the
                # current label, then update the list of sampled indexes
                labelIdxs = random.sample(list(labelIdxs), min_images)
                sampledIdxs.extend(labelIdxs)

        # use the sampled indexes to select the appropriate data points
        # and labels
        random.shuffle(sampledIdxs)
        data = data[sampledIdxs]
        labels = labels[sampledIdxs]

    # shuffle the samples before splitting; the original shuffled a
    # throwaway copy of the index range (`random.shuffle(list(idxs))`), so
    # the split was never randomized when equal_samples was False
    idxs = list(range(len(data)))
    random.shuffle(idxs)
    data = data[idxs]
    labels = labels[idxs]

    # compute the training and testing split index
    split = int(len(idxs) * (1.0 - test_size))

    # split the data into training and testing segments
    (trainData, testData) = (data[:split], data[split:])
    (trainLabels, testLabels) = (labels[:split], labels[split:])

    # create the training and testing bunches
    training = Bunch(name="training", data=trainData, target=trainLabels)
    testing = Bunch(name="testing", data=testData, target=testLabels)

    # return a tuple of the training, testing bunches, and original labels
    return (training, testing, labels)
class ResultsMontage:
    """Incrementally assemble fixed-size image tiles into one grid image.

    Tiles are added left-to-right, top-to-bottom with ``addResult``; the
    assembled 3-channel (BGR) image is available as the ``montage``
    attribute at any time.
    """

    def __init__(self, imageSize, imagesPerRow, numResults):
        # store the target tile size; imageSize is given in NumPy
        # (rows, cols) order, i.e. (height, width) of each cell
        self.imageW = imageSize[0]
        self.imageH = imageSize[1]
        self.imagesPerRow = imagesPerRow

        # allocate memory for the output image — round the row count UP so
        # a numResults that is not an exact multiple of imagesPerRow still
        # fits (the original floor division under-allocated and the partial
        # last row could not be written)
        numRows = -(-numResults // imagesPerRow)
        self.montage = np.zeros((numRows * self.imageW, imagesPerRow * self.imageH, 3), dtype="uint8")

        # initialize the counter for the current image along with the row
        # and column number
        self.counter = 0
        self.row = 0
        self.col = 0

    def addResult(self, image, text=None, highlight=False):
        """Place `image` into the next free grid cell.

        Optionally draws `text` in the cell's top-left corner and, when
        `highlight` is True, a green border around the cell.
        """
        # check to see if the number of images per row has been met, and if
        # so, reset the column counter and increment the row
        if self.counter != 0 and self.counter % self.imagesPerRow == 0:
            self.col = 0
            self.row += 1

        # resize the image to the fixed cell size (cv2.resize takes
        # (width, height)) and blit it into the montage
        image = cv2.resize(image, (self.imageH, self.imageW))
        (startY, endY) = (self.row * self.imageW, (self.row + 1) * self.imageW)
        (startX, endX) = (self.col * self.imageH, (self.col + 1) * self.imageH)
        self.montage[startY:endY, startX:endX] = image

        # if the text is not None, draw it
        if text is not None:
            cv2.putText(self.montage, text, (startX + 10, startY + 30), cv2.FONT_HERSHEY_SIMPLEX,
                        1.0, (0, 255, 255), 3)

        # check to see if the result should be highlighted
        if highlight:
            cv2.rectangle(self.montage, (startX + 3, startY + 3), (endX - 3, endY - 3), (0, 255, 0), 4)

        # increment the column counter and image counter
        self.col += 1
        self.counter += 1
# Due to sklearn deprecation of RandomizedPCA this function will check the version
def is_sklearn_less_than_0_18(version=None):
    """Return True if the scikit-learn version is strictly below 0.18.

    The original implementation compared only the *minor* component, so any
    release with a minor version below 18 — including every 1.x release —
    was wrongly reported as older than 0.18 and sent callers down the
    removed-``RandomizedPCA`` branch. Comparing (major, minor) tuples
    fixes that.

    Parameters
    ----------
    version : str, optional
        Version string to check (e.g. "0.17.1"); defaults to the installed
        ``sklearn.__version__``.
    """
    if version is None:
        version = sklearn.__version__
    parts = version.split(".")
    major, minor = int(parts[0]), int(parts[1])
    return (major, minor) < (0, 18)
# python UCSB_malware_image_classification.py -nc 150 -min_im 90 -wh_c True -vis_c False
# python UCSB_malware_image_classification.py -nc 20 -min_im 300 -wh_c 1 -vis_c 0 ---> 0.9711111111111111
if __name__ == "__main__":
    # parse the command-line arguments (parse_args reads sys.argv itself,
    # so the original's unused `argv = sys.argv[1:]` was removed)
    parser = argparse.ArgumentParser()
    # -nc: number of retained principal components
    parser.add_argument('-nc', '--n_comp', default=100, type=int)
    # -min_im: minimum number of images required (and kept) per class
    parser.add_argument('-min_im', '--min_images', default=50, type=int)
    # -wh_c: 1 to whiten the PCA components, anything else disables it
    parser.add_argument('-wh_c', '--whiten_check', default=1, type=int)
    # -vis_c: 1 to visualize the mean/eigen images, anything else disables it
    parser.add_argument('-vis_c', '--visualize_check', default=0, type=int)
    args = parser.parse_args()

    n_comp = args.n_comp
    min_imgs = args.min_images
    # collapse the 0/1 integer flags to booleans (replaces the original
    # four-line if/else blocks and the `== True` comparison below)
    whiten = args.whiten_check == 1
    visualize = args.visualize_check == 1

    print('No of PCs')
    print(n_comp)
    print('No of min_images')
    print(min_imgs)

    # Load the UCSB malware image dataset
    print("[INFO] loading UCSB malware image dataset...")
    training, testing, labels = load_ucsb_malware_images('malimg_paper_dataset_imgs', min_images=min_imgs, flatten=True, test_size=0.25)

    # compute the PCA (eigenvectors) representation of the data; the
    # version check is done once (the original checked it twice — once for
    # the import and again for the construction)
    print("[INFO] creating eigenvectors for the malware images...")
    if is_sklearn_less_than_0_18():
        # sklearn < 0.18 still ships RandomizedPCA
        print("[INFO] sklearn == {}, so using RandomizedPCA".format(sklearn.__version__))
        from sklearn.decomposition import RandomizedPCA
        pca = RandomizedPCA(n_components=n_comp, whiten=whiten)
    else:
        # otherwise sklearn's RandomizedPCA is deprecated and we need to
        # use PCA with the randomized SVD solver
        print("[INFO] sklearn=={}, so using PCA".format(sklearn.__version__))
        from sklearn.decomposition import PCA
        pca = PCA(svd_solver="randomized", n_components=n_comp, whiten=whiten)

    # project the training data onto the eigenvector subspace
    trainData = pca.fit_transform(training.data)

    # check to see if the PCA components should be visualized
    if visualize:
        # initialize the montage for the first 16 components
        montage = ResultsMontage((100, 128), 4, 16)

        # loop over the first 16 individual components
        for component in pca.components_[:16]:
            # reshape the component to a 2D matrix, then rescale it to an
            # unsigned 8-bit range so it can be displayed with OpenCV
            component = component.reshape((100, 128))
            component = exposure.rescale_intensity(component, out_range=(0, 255)).astype("uint8")
            component = np.dstack([component] * 3)
            montage.addResult(component)

        # show the mean image and the principal component visualizations
        mean = pca.mean_.reshape((100, 128))
        mean = exposure.rescale_intensity(mean, out_range=(0, 255)).astype("uint8")
        cv2.imshow("Mean Malware Image", mean)
        cv2.imshow("Eigen Malware", montage.montage)
        cv2.waitKey(0)

    # train a classifier on the eigen-representation
    print("[INFO] training classifier...")
    model = SVC(kernel="rbf", C=10.0, gamma=0.001, random_state=84)
    model.fit(trainData, training.target)

    # evaluate the model on the projected testing data
    print("[INFO] evaluating model...")
    predictions = model.predict(pca.transform(testing.data))
    print(classification_report(testing.target, predictions))

    # Convert the malware names to class labels
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    # print(le.classes_)
    # num_labels = le.transform(le.classes_)
    # print(num_labels)
    # print(confusion_matrix(testing.target, predictions, labels=le.classes_)) #range(np.max(num_labels))
    print(accuracy_score(testing.target, predictions))

    # loop over the the desired number of samples
    # for i in np.random.randint(0, high=len(testing.data), size=(10, )):
    #     # grab the malware image and classify it
    #     malware_image = testing.data[i].reshape((100, 128)).astype("uint8")
    #     prediction = model.predict(pca.transform(testing.data[i].reshape(1, -1)))
    #     # resize the face to make it more visable, then display the face and the prediction
    #     print("[INFO] Prediction: {}, Actual: {}".format(prediction[0], testing.target[i]))
    #     malware_image = imutils.resize(malware_image, width=malware_image.shape[1] * 2, inter=cv2.INTER_CUBIC)
    #     cv2.imshow("Malware", malware_image)
    #     cv2.waitKey(0)