-
Notifications
You must be signed in to change notification settings - Fork 44
Expand file tree
/
Copy pathtensorflow_learn.py
More file actions
executable file
·108 lines (85 loc) · 4.41 KB
/
tensorflow_learn.py
File metadata and controls
executable file
·108 lines (85 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python3
"""
This script reads app_permission_vectors.json (written by parse_xml.py) and
feeds the data into a tensorflow "neural network" to try to learn from it.
"""
import tensorflow as tf
import numpy as np
import json
import math
import random
import sys
from configparser import ConfigParser
__author__='mwleeds'
def main():
    """Train a softmax classifier on app permission vectors and report test metrics.

    Command-line arguments:
        sys.argv[1]: path to a JSON dataset. It is read for the keys
            'features' and 'apps'; every entry of 'apps' supplies a 'vector'
            and a 'malicious' label ([1,0] for malicious, [0,1] for benign).
        sys.argv[2]: number of training iterations.

    LEARNING_RATE, NUM_CHUNKS, SHUFFLE_CHUNKS and DECAY_RATE are read from the
    'AMA' section of config.ini in the current working directory.

    Side effects: writes the learned weight matrix to feature_weights.json and
    prints, one value per line, the false positive rate, the false negative
    rate, the accuracy, the number of malicious apps and the number of benign
    apps.

    Raises:
        SystemExit: if fewer than two command-line arguments are given.
        ValueError: if the data cannot be split into one test chunk plus at
            least one training chunk for each class.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: {} <dataset.json> <num_iterations>'.format(sys.argv[0]))
    num_iterations = int(sys.argv[2])

    config = ConfigParser()
    config.read('config.ini')
    LEARNING_RATE = config.getfloat('AMA', 'LEARNING_RATE')
    NUM_CHUNKS = config.getint('AMA', 'NUM_CHUNKS')
    SHUFFLE_CHUNKS = config.getboolean('AMA', 'SHUFFLE_CHUNKS')
    DECAY_RATE = config.getfloat('AMA', 'DECAY_RATE')

    # load the data from a file
    with open(sys.argv[1]) as infile:
        dataset = json.load(infile)
    apps = dataset['apps']
    num_features = len(dataset['features'])

    #TODO make a clean interface for this
    #TODO make sure the training sample contains reasonable proportions of benign/malicious
    malicious_app_names = [app for app in apps if apps[app]['malicious'] == [1, 0]]
    benign_app_names = [app for app in apps if apps[app]['malicious'] == [0, 1]]

    # break up the data into chunks for training and testing; the chunk size
    # is derived from the total number of apps, not from each class's size
    chunk_size = len(apps) // NUM_CHUNKS
    if chunk_size < 1:
        raise ValueError('NUM_CHUNKS (%d) exceeds the number of apps (%d)'
                         % (NUM_CHUNKS, len(apps)))
    malicious_app_name_chunks = list(chunks(malicious_app_names, chunk_size))
    benign_app_name_chunks = list(chunks(benign_app_names, chunk_size))
    if not malicious_app_name_chunks or not benign_app_name_chunks:
        raise ValueError('the dataset must contain both malicious and benign apps')
    # chunk 0 of each class is held out for testing, so training needs a second chunk
    if num_iterations > 0 and min(len(malicious_app_name_chunks),
                                  len(benign_app_name_chunks)) < 2:
        raise ValueError('each class needs at least 2 chunks (1 test + 1 training); '
                         'got %d malicious and %d benign chunk(s) of size %d'
                         % (len(malicious_app_name_chunks),
                            len(benign_app_name_chunks), chunk_size))
    if SHUFFLE_CHUNKS:
        random.shuffle(malicious_app_name_chunks)
        random.shuffle(benign_app_name_chunks)

    def examples(app_names):
        """Return the (vectors, labels) lists for the given app names."""
        return ([apps[app]['vector'] for app in app_names],
                [apps[app]['malicious'] for app in app_names])

    # placeholder for any number of bit vectors
    x = tf.placeholder(tf.float32, [None, num_features])
    # weights for each synapse
    W = tf.Variable(tf.zeros([num_features, 2]))
    b = tf.Variable(tf.zeros([2]))
    # results
    logits = tf.matmul(x, W) + b
    y = tf.nn.softmax(logits)
    # placeholder for correct answers
    y_ = tf.placeholder(tf.float32, [None, 2])
    # Same quantity as -sum(y_ * log(softmax(logits))), but computed from the
    # logits so a saturated softmax cannot produce log(0) = -inf / NaN.
    cross_entropy = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))

    # Build the decay schedule and the optimizer ONCE and feed the iteration
    # number in; creating them inside the loop adds new ops to the graph on
    # every iteration, which makes each step slower and leaks memory.
    # (original author's note: decayed learning rate increases accuracy by 3-5%)
    global_step = tf.placeholder(tf.int32, [])
    decayed_learning_rate = tf.train.exponential_decay(
        LEARNING_RATE, global_step, 1, DECAY_RATE, staircase=False)
    train_step = tf.train.GradientDescentOptimizer(
        decayed_learning_rate).minimize(cross_entropy)

    # Labels are [1,0] (malicious, argmax 0) or [0,1] (benign, argmax 1), so
    # predicted - actual is -1 for a benign app flagged as malicious, +1 for a
    # malicious app classified as benign, and 0 for a correct prediction.
    prediction_diff = tf.subtract(tf.argmax(y, 1), tf.argmax(y_, 1))
    # calculate how often benign apps were thought to be malicious
    false_positive = tf.equal(prediction_diff, tf.constant(-1, dtype=tf.int64))
    # calculate how often malicious apps were thought to be benign
    false_negative = tf.equal(prediction_diff, tf.constant(1, dtype=tf.int64))
    correct_prediction = tf.equal(prediction_diff, tf.constant(0, dtype=tf.int64))
    false_positive_rate = tf.reduce_mean(tf.cast(false_positive, tf.float32))
    false_negative_rate = tf.reduce_mean(tf.cast(false_negative, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # the context manager releases the session's resources even on error
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # the first chunk of each will be used for testing (and the rest for training)
        for i in range(num_iterations):
            j = random.randrange(1, len(malicious_app_name_chunks))
            k = random.randrange(1, len(benign_app_name_chunks))
            batch_xs, batch_ys = examples(
                malicious_app_name_chunks[j] + benign_app_name_chunks[k])
            sess.run(train_step,
                     feed_dict={x: batch_xs, y_: batch_ys, global_step: i})

        feature_weights = sess.run(W)
        # store feature weights that were calculated. Used by match_features.py
        with open('feature_weights.json', 'w') as f:
            json.dump(feature_weights.tolist(), f)

        test_xs, test_ys = examples(
            malicious_app_name_chunks[0] + benign_app_name_chunks[0])
        # evaluate all three metrics in a single pass over the test data
        fp_rate, fn_rate, acc = sess.run(
            [false_positive_rate, false_negative_rate, accuracy],
            feed_dict={x: test_xs, y_: test_ys})

    print(fp_rate)
    print(fn_rate)
    print(acc)
    print(len(malicious_app_names))
    print(len(benign_app_names))
def chunks(l, n):
    """Generate consecutive slices of ``l`` holding at most ``n`` items each.

    Every slice has ``n`` items except possibly the last one, which holds
    whatever remains. ``l`` must support ``len()`` and slicing.
    """
    yield from (l[offset:offset + n] for offset in range(0, len(l), n))
# Run the training/evaluation pipeline only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()