forked from shubhabrataroy/MalwareDetection
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_consolidation.py
More file actions
89 lines (71 loc) · 2.69 KB
/
data_consolidation.py
File metadata and controls
89 lines (71 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 09 16:50:14 2015
@author: VishnuC
@email: vrajs5@gmail.com
Beating the benchmark for Microsoft Malware Classification Challenge (BIG 2015)
"""
from multiprocessing import Pool
import os
import gzip
from csv import writer
import six
# gzip.open must be asked for text mode explicitly on Python 3 ('rt'/'wt');
# Python 2 uses the plain 'r'/'w' modes.
read_mode, write_mode = ('r', 'w') if six.PY2 else ('rt', 'wt')

path = ''  # Path to project
# Only change directory when a project path has actually been filled in:
# os.chdir('') raises, which made the module fail on import for as long as
# the placeholder above was left empty. With an empty path the script works
# relative to the current working directory.
if path:
    os.chdir(path)

# Lazy zip under both major Python versions.
if six.PY2:
    from itertools import izip
    zp = izip
else:
    zp = zip

# Data set prefixes; consolidate() reads the gzip files from '<prefix>_gz/'
paths = ['train', 'test']
def consolidate(path):
    ''' Consolidate the gzipped byte files of one data set into one CSV.

    Every file in the directory ``path + '_gz/'`` whose name ends with
    '.bytes.gz' is read line by line. The first whitespace-separated token
    of each line is skipped and the remaining tokens are treated as
    two-hex-digit codes: '??' tokens are counted, every other token is
    parsed as a base-16 integer and tallied in a 256-bin histogram.

    One row per input file (file name without the '.bytes.gz' suffix, the
    '??' count, then the 256 histogram counts) is written to the gzipped
    CSV ``path + '_consolidation.gz'``.

    path: data set prefix, e.g. 'train' or 'test'. The module-level
          ``read_mode`` / ``write_mode`` are used to open the gzip files.

    Returns None. Raises ValueError if a token is neither '??' nor valid
    hexadecimal, and IndexError if a token parses to a value above 0xff.
    '''
    suffix = '.bytes.gz'
    s_path = path + '_gz/'
    # endswith (not a substring test) so that e.g. 'x.bytes.gz.bak' is skipped
    byteFiles = [i for i in os.listdir(s_path) if i.endswith(suffix)]
    consolidatedFile = path + '_consolidation.gz'

    with gzip.open(consolidatedFile, write_mode) as out:
        fw = writer(out)

        # Preparing header part
        colnames = ['filename', 'no_que_mark']
        colnames += ['TB_' + hex(i)[2:] for i in range(16**2)]
        # NOTE(review): the FB_* columns are declared here but no row ever
        # provides values for them (rows carry 2 + 256 fields only). Kept so
        # that the header is unchanged for existing consumers - confirm
        # whether these counts were meant to be computed or the columns
        # should be dropped.
        colnames += ['FB_' + hex(i)[2:] for i in range(16**4)]
        fw.writerow(colnames)

        # Creating row set
        consolidation = []
        for t, fname in enumerate(byteFiles):
            twoByte = [0] * 16**2
            no_que_mark = 0
            # Separate handle name and a with-block: the input file is
            # closed after use and no longer shadows the output handle.
            with gzip.open(s_path + fname, read_mode) as src:
                for row in src:
                    # split() discards the line terminator whatever it is
                    # ('\r\n', '\n' or none on the last line). Slicing off a
                    # fixed two characters truncated the final code of each
                    # line whenever the terminator was not exactly two
                    # characters long, which is the case in Python 3 text
                    # mode where '\r\n' is translated to '\n'.
                    codes = row.split()[1:]
                    # Finding number of times ?? appears
                    no_que_mark += codes.count('??')
                    # Frequency calculation of two byte codes
                    for code in codes:
                        if code != '??':
                            twoByte[int(code, 16)] += 1
            # Row added
            consolidation.append([fname[:-len(suffix)], no_que_mark]
                                 + twoByte)
            # Writing rows after every 100 files processed
            if (t+1) % 100 == 0:
                print(t+1, 'files loaded for ', path)
                fw.writerows(consolidation)
                consolidation = []

        # Writing remaining files
        if consolidation:
            fw.writerows(consolidation)
    # No explicit `del` of the locals: they are released on return, and
    # deleting names that were never bound (no input files) raised
    # UnboundLocalError.
if __name__ == '__main__':
    # Process the data sets listed in `paths` in parallel worker processes.
    p = Pool(2)
    try:
        p.map(consolidate, paths)
    finally:
        # Always shut the pool down so no worker processes are left behind,
        # even when a consolidation raises. try/finally is used instead of
        # `with Pool(...)` because the latter is unavailable on Python 2.
        p.close()
        p.join()