-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathUCSB_malware_image_classification.py
More file actions
264 lines (210 loc) · 9.12 KB
/
UCSB_malware_image_classification.py
File metadata and controls
264 lines (210 loc) · 9.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env python
# coding: utf-8
# In[4]:
# import the necessary packages
from __future__ import print_function

import argparse
import os
import random
import sys

import cv2
import imutils # Make sure to install imutils using pip install imutils
import numpy as np
import sklearn
from imutils import paths
from scipy import io
from skimage import exposure
from sklearn import preprocessing
# NOTE(review): sklearn.datasets.base was removed in modern scikit-learn;
# `from sklearn.utils import Bunch` is the current location — confirm the
# pinned sklearn version before upgrading.
from sklearn.datasets.base import Bunch
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
def load_ucsb_malware_images(datasetPath, min_images=20, image_size=(128, 100), equal_samples=True,
                             test_size=0.33, seed=42, flatten=False):
    """Load the UCSB/Malimg malware-image dataset from a directory tree.

    Each image is read, converted to grayscale, and resized to
    ``image_size`` (an OpenCV ``(width, height)`` pair). The class label is
    the name of the directory that contains the image.

    Parameters
    ----------
    datasetPath : str
        Root directory with one sub-directory per malware class.
    min_images : int
        Minimum (and, when ``equal_samples`` is True, exact) number of
        images kept per class; classes with fewer images are dropped.
    image_size : tuple of int
        Target ``(width, height)`` passed to ``cv2.resize``.
    equal_samples : bool
        If True, randomly keep exactly ``min_images`` images per class.
    test_size : float
        Fraction of the samples reserved for the testing split.
    seed : int
        Seed for the module-level ``random`` generator.
    flatten : bool
        If True, each image is flattened to a 1-D feature vector.

    Returns
    -------
    tuple
        ``(training, testing, labels)`` — two ``Bunch`` objects carrying
        ``data``/``target`` arrays, plus the label array the splits were
        drawn from.
    """
    # grab the image paths associated with the malware images
    imagePaths = sorted(list(paths.list_images(datasetPath)))

    # set the random seed, then initialize the data matrix and labels
    random.seed(seed)
    data = []
    labels = []

    # loop over the image paths
    for imagePath in imagePaths:
        # load the image; skip files OpenCV cannot decode (imread returns
        # None), which would otherwise crash cvtColor
        image = cv2.imread(imagePath)
        if image is None:
            continue

        # convert to grayscale and resize to the fixed input size
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        face = cv2.resize(gray, image_size)

        # check to see if the image should be flattened into a single row
        if flatten:
            face = face.flatten()

        # the label is the name of the directory holding the image; using
        # os.path handles both "/" and "\\" separators (the original split
        # on "\\" and therefore only worked on Windows)
        data.append(face)
        labels.append(os.path.basename(os.path.dirname(imagePath)))

    # convert the data matrix and labels list to NumPy arrays
    data = np.array(data)
    labels = np.array(labels)

    # check to see if equal samples for each malware class should be used
    if equal_samples:
        # initialize the list of sampled indexes
        sampledIdxs = []

        # loop over the unique labels
        for label in np.unique(labels):
            # grab the indexes into the labels array where labels equals
            # the current label
            labelIdxs = np.where(labels == label)[0]

            # only proceed if the required number of minimum images per
            # malware class can be met
            if len(labelIdxs) >= min_images:
                # randomly sample exactly `min_images` indexes for the
                # current label, then update the list of sampled indexes
                labelIdxs = random.sample(list(labelIdxs), min_images)
                sampledIdxs.extend(labelIdxs)

        # use the sampled indexes to select the appropriate data points
        # and labels
        random.shuffle(sampledIdxs)
        data = data[sampledIdxs]
        labels = labels[sampledIdxs]

    # shuffle the samples before splitting; the original shuffled a
    # throwaway copy of the index range (`random.shuffle(list(idxs))`), so
    # the split was never randomized when equal_samples was False
    idxs = list(range(len(data)))
    random.shuffle(idxs)
    data = data[idxs]
    labels = labels[idxs]

    # compute the training and testing split index
    split = int(len(idxs) * (1.0 - test_size))

    # split the data into training and testing segments
    (trainData, testData) = (data[:split], data[split:])
    (trainLabels, testLabels) = (labels[:split], labels[split:])

    # create the training and testing bunches
    training = Bunch(name="training", data=trainData, target=trainLabels)
    testing = Bunch(name="testing", data=testData, target=testLabels)

    # return a tuple of the training, testing bunches, and original labels
    return (training, testing, labels)
class ResultsMontage:
    """Incrementally assemble fixed-size image tiles into one grid image.

    Tiles are added left-to-right, top-to-bottom with ``addResult``; the
    assembled 3-channel (BGR) image is available as the ``montage``
    attribute at any time.
    """

    def __init__(self, imageSize, imagesPerRow, numResults):
        # store the target tile size; imageSize is given in NumPy
        # (rows, cols) order, i.e. (height, width) of each cell
        self.imageW = imageSize[0]
        self.imageH = imageSize[1]
        self.imagesPerRow = imagesPerRow

        # allocate memory for the output image — round the row count UP so
        # a numResults that is not an exact multiple of imagesPerRow still
        # fits (the original floor division under-allocated and the partial
        # last row could not be written)
        numRows = -(-numResults // imagesPerRow)
        self.montage = np.zeros((numRows * self.imageW, imagesPerRow * self.imageH, 3), dtype="uint8")

        # initialize the counter for the current image along with the row
        # and column number
        self.counter = 0
        self.row = 0
        self.col = 0

    def addResult(self, image, text=None, highlight=False):
        """Place `image` into the next free grid cell.

        Optionally draws `text` in the cell's top-left corner and, when
        `highlight` is True, a green border around the cell.
        """
        # check to see if the number of images per row has been met, and if
        # so, reset the column counter and increment the row
        if self.counter != 0 and self.counter % self.imagesPerRow == 0:
            self.col = 0
            self.row += 1

        # resize the image to the fixed cell size (cv2.resize takes
        # (width, height)) and blit it into the montage
        image = cv2.resize(image, (self.imageH, self.imageW))
        (startY, endY) = (self.row * self.imageW, (self.row + 1) * self.imageW)
        (startX, endX) = (self.col * self.imageH, (self.col + 1) * self.imageH)
        self.montage[startY:endY, startX:endX] = image

        # if the text is not None, draw it
        if text is not None:
            cv2.putText(self.montage, text, (startX + 10, startY + 30), cv2.FONT_HERSHEY_SIMPLEX,
                        1.0, (0, 255, 255), 3)

        # check to see if the result should be highlighted
        if highlight:
            cv2.rectangle(self.montage, (startX + 3, startY + 3), (endX - 3, endY - 3), (0, 255, 0), 4)

        # increment the column counter and image counter
        self.col += 1
        self.counter += 1
# Due to sklearn deprecation of RandomizedPCA this function will check the version
def is_sklearn_less_than_0_18(version=None):
    """Return True if the scikit-learn version is strictly below 0.18.

    The original implementation compared only the *minor* component, so any
    release with a minor version below 18 — including every 1.x release —
    was wrongly reported as older than 0.18 and sent callers down the
    removed-``RandomizedPCA`` branch. Comparing (major, minor) tuples
    fixes that.

    Parameters
    ----------
    version : str, optional
        Version string to check (e.g. "0.17.1"); defaults to the installed
        ``sklearn.__version__``.
    """
    if version is None:
        version = sklearn.__version__
    parts = version.split(".")
    major, minor = int(parts[0]), int(parts[1])
    return (major, minor) < (0, 18)
# python UCSB_malware_image_classification.py -nc 150 -min_im 90 -wh_c True -vis_c False
# python UCSB_malware_image_classification.py -nc 20 -min_im 300 -wh_c 1 -vis_c 0 ---> 0.9711111111111111
if __name__ == "__main__":
    # parse the command-line arguments (parse_args reads sys.argv itself,
    # so the original's unused `argv = sys.argv[1:]` was removed)
    parser = argparse.ArgumentParser()
    # -nc: number of retained principal components
    parser.add_argument('-nc', '--n_comp', default=100, type=int)
    # -min_im: minimum number of images required (and kept) per class
    parser.add_argument('-min_im', '--min_images', default=50, type=int)
    # -wh_c: 1 to whiten the PCA components, anything else disables it
    parser.add_argument('-wh_c', '--whiten_check', default=1, type=int)
    # -vis_c: 1 to visualize the mean/eigen images, anything else disables it
    parser.add_argument('-vis_c', '--visualize_check', default=0, type=int)
    args = parser.parse_args()

    n_comp = args.n_comp
    min_imgs = args.min_images
    # collapse the 0/1 integer flags to booleans (replaces the original
    # four-line if/else blocks and the `== True` comparison below)
    whiten = args.whiten_check == 1
    visualize = args.visualize_check == 1

    print('No of PCs')
    print(n_comp)
    print('No of min_images')
    print(min_imgs)

    # Load the UCSB malware image dataset
    print("[INFO] loading UCSB malware image dataset...")
    training, testing, labels = load_ucsb_malware_images('malimg_paper_dataset_imgs', min_images=min_imgs, flatten=True, test_size=0.25)

    # compute the PCA (eigenvectors) representation of the data; the
    # version check is done once (the original checked it twice — once for
    # the import and again for the construction)
    print("[INFO] creating eigenvectors for the malware images...")
    if is_sklearn_less_than_0_18():
        # sklearn < 0.18 still ships RandomizedPCA
        print("[INFO] sklearn == {}, so using RandomizedPCA".format(sklearn.__version__))
        from sklearn.decomposition import RandomizedPCA
        pca = RandomizedPCA(n_components=n_comp, whiten=whiten)
    else:
        # otherwise sklearn's RandomizedPCA is deprecated and we need to
        # use PCA with the randomized SVD solver
        print("[INFO] sklearn=={}, so using PCA".format(sklearn.__version__))
        from sklearn.decomposition import PCA
        pca = PCA(svd_solver="randomized", n_components=n_comp, whiten=whiten)

    # project the training data onto the eigenvector subspace
    trainData = pca.fit_transform(training.data)

    # check to see if the PCA components should be visualized
    if visualize:
        # initialize the montage for the first 16 components
        montage = ResultsMontage((100, 128), 4, 16)

        # loop over the first 16 individual components
        for component in pca.components_[:16]:
            # reshape the component to a 2D matrix, then rescale it to an
            # unsigned 8-bit range so it can be displayed with OpenCV
            component = component.reshape((100, 128))
            component = exposure.rescale_intensity(component, out_range=(0, 255)).astype("uint8")
            component = np.dstack([component] * 3)
            montage.addResult(component)

        # show the mean image and the principal component visualizations
        mean = pca.mean_.reshape((100, 128))
        mean = exposure.rescale_intensity(mean, out_range=(0, 255)).astype("uint8")
        cv2.imshow("Mean Malware Image", mean)
        cv2.imshow("Eigen Malware", montage.montage)
        cv2.waitKey(0)

    # train a classifier on the eigen-representation
    print("[INFO] training classifier...")
    model = SVC(kernel="rbf", C=10.0, gamma=0.001, random_state=84)
    model.fit(trainData, training.target)

    # evaluate the model on the projected testing data
    print("[INFO] evaluating model...")
    predictions = model.predict(pca.transform(testing.data))
    print(classification_report(testing.target, predictions))

    # Convert the malware names to class labels
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    # print(le.classes_)
    # num_labels = le.transform(le.classes_)
    # print(num_labels)
    # print(confusion_matrix(testing.target, predictions, labels=le.classes_)) #range(np.max(num_labels))
    print(accuracy_score(testing.target, predictions))

    # loop over the the desired number of samples
    # for i in np.random.randint(0, high=len(testing.data), size=(10, )):
    #     # grab the malware image and classify it
    #     malware_image = testing.data[i].reshape((100, 128)).astype("uint8")
    #     prediction = model.predict(pca.transform(testing.data[i].reshape(1, -1)))
    #     # resize the face to make it more visable, then display the face and the prediction
    #     print("[INFO] Prediction: {}, Actual: {}".format(prediction[0], testing.target[i]))
    #     malware_image = imutils.resize(malware_image, width=malware_image.shape[1] * 2, inter=cv2.INTER_CUBIC)
    #     cv2.imshow("Malware", malware_image)
    #     cv2.waitKey(0)