-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmalware_dataset_classifier_1.py
More file actions
76 lines (61 loc) · 2.52 KB
/
malware_dataset_classifier_1.py
File metadata and controls
76 lines (61 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# import the necessary packages
from sklearn.datasets.base import Bunch
from imutils import paths
from scipy import io
import numpy as np
import random
import cv2
def load_ucsb_malware_images(datasetPath, min_images=20, image_size=(128, 100),
                             equal_samples=True, test_size=0.33, seed=42,
                             flatten=False):
    """Load a directory tree of malware images and split it into train/test sets.

    Every image found under ``datasetPath`` is read, converted to grayscale and
    resized to ``image_size``. The label of an image is the name of the
    directory that directly contains it.

    Args:
        datasetPath: Root directory handed to ``imutils.paths.list_images``.
        min_images: When ``equal_samples`` is true, the number of images
            randomly sampled for each label; labels with fewer images than
            this are dropped entirely.
        image_size: Target size passed to ``cv2.resize``.
            NOTE(review): OpenCV documents this tuple as (width, height) --
            confirm that matches the intended orientation.
        equal_samples: If true, keep exactly ``min_images`` images per label.
        test_size: Fraction of the (shuffled) samples placed in the testing
            bunch; the remainder goes to the training bunch.
        seed: Value passed to ``random.seed`` so sampling and shuffling are
            repeatable.
        flatten: If true, each resized image is flattened into a single row.

    Returns:
        A tuple ``(training, testing, labels)``. ``training`` and ``testing``
        are ``Bunch`` objects with ``name``, ``data`` and ``target`` fields;
        ``labels`` is the full label array in the same order as the training
        samples followed by the testing samples.
    """
    # imported locally so this function does not rely on a file-level `import os`
    import os

    # grab the image paths associated with the malware images
    imagePaths = sorted(paths.list_images(datasetPath))

    # set the random seed, then initialize the data matrix and labels
    random.seed(seed)
    data = []
    labels = []

    # loop over the image paths
    for imagePath in imagePaths:
        # load the image, convert it to grayscale and resize it
        # NOTE(review): per the OpenCV docs cv2.imread returns None for a file
        # it cannot decode, in which case cvtColor below will fail -- confirm
        # whether such files should be skipped instead.
        image = cv2.imread(imagePath)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        sample = cv2.resize(gray, image_size)

        # check to see if the image should be flattened into a single row
        if flatten:
            sample = sample.flatten()

        # update the data matrix and associated labels; the label is the name
        # of the parent directory, extracted with os.path so that it works
        # with the path separator of the current platform (the previous
        # split("\\") only handled Windows-style paths)
        data.append(sample)
        labels.append(os.path.basename(os.path.dirname(imagePath)))

    # convert the data matrix and labels list to NumPy arrays
    data = np.array(data)
    labels = np.array(labels)

    # check to see if an equal number of samples per label should be used
    if equal_samples:
        # initialize the list of sampled indexes
        sampledIdxs = []

        # loop over the unique labels
        for label in np.unique(labels):
            # grab the indexes into the labels array where labels equals the
            # current label
            labelIdxs = np.where(labels == label)[0]

            # only proceed if the required minimum number of images is met
            if len(labelIdxs) >= min_images:
                # randomly sample the indexes for the current label, keeping
                # only the minimum supplied amount, then update the list of
                # sampled indexes
                labelIdxs = random.sample(list(labelIdxs), min_images)
                sampledIdxs.extend(labelIdxs)

        # use the sampled indexes to select the appropriate data points and
        # labels
        random.shuffle(sampledIdxs)
        data = data[sampledIdxs]
        labels = labels[sampledIdxs]

    # shuffle the samples before splitting; the shuffle has to be applied to
    # a list that is kept and then used for indexing, otherwise the split is
    # taken over the path-sorted data and the test set ends up holding only
    # the last classes
    idxs = list(range(len(data)))
    random.shuffle(idxs)
    data = data[idxs]
    labels = labels[idxs]

    # compute the training and testing split index
    split = int(len(idxs) * (1.0 - test_size))

    # split the data into training and testing segments
    (trainData, testData) = (data[:split], data[split:])
    (trainLabels, testLabels) = (labels[:split], labels[split:])

    # create the training and testing bunches
    training = Bunch(name="training", data=trainData, target=trainLabels)
    testing = Bunch(name="testing", data=testData, target=testLabels)

    # return a tuple of the training, testing bunches, and the labels
    return (training, testing, labels)