android-malware-analysis/match_features.py at master · mwleeds/android-malware-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/python3

"""
This script reads the JSON file passed as the first command line argument to
get the names of the features, and reads feature_weights.json (written by
tensorflow_learn.py) to get the feature weights from a trained ML model. It then
matches the feature weights to their human-readable names and prints them out,
sorted by weight.
So the first feature listed is the one most indicative of maliciousness, and the first
one in the second list is the one most indicative of benign (the lists are just mirror
images of each other).
"""

import json
import sys

def _print_ranking(title, feature_names, weights):
    """Print `title`, then one row per weight, from highest to lowest weight.

    Each row has the form '<rank>. <feature name> <weight>', with ranks
    counted from 0. `weights[i]` is labelled with `feature_names[i]`.
    """
    print(title)
    # sorted() is stable (also with reverse=True), so features with equal
    # weights keep their original relative order.
    ranked = sorted(enumerate(weights), key=lambda pair: pair[1], reverse=True)
    for rank, (index, weight) in enumerate(ranked):
        print('{}. {} {}'.format(rank, feature_names[index], weight))


def main():
    """Print every feature ranked by malicious weight, then by benign weight.

    Inputs:
      * sys.argv[1]: path to a JSON file whose 'features' key holds the list
        of feature names used in the model.
      * 'feature_weights.json' (resolved against the current working
        directory): a JSON list with one entry per feature, where element 0
        is the malicious weight and element 1 is the benign weight.

    Exits with an error message (SystemExit) when the command line argument is
    missing, or when there are more weight entries than feature names.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: match_features.py FEATURES_JSON')

    # JSON is read as UTF-8 explicitly so that non-ASCII feature names do not
    # depend on the platform's default locale encoding.
    with open(sys.argv[1], encoding='utf-8') as vectors:
        # Dataset of feature names that were used in the model
        feature_names = json.load(vectors)['features']

    with open('feature_weights.json', encoding='utf-8') as weights:
        # Tensorflow model calculated weights for every feature
        feature_weights = json.load(weights)

    # Every weight needs a name to be printed with; fail up front with a clear
    # message instead of an IndexError part-way through the output.
    if len(feature_weights) > len(feature_names):
        sys.exit('feature_weights.json has {} entries but {} names only {} features'.format(
            len(feature_weights), sys.argv[1], len(feature_names)))

    # Separate malicious and benign weights
    malicious_weights = [weight[0] for weight in feature_weights]
    benign_weights = [weight[1] for weight in feature_weights]

    # Print the rank of each feature, the feature name, and its weight
    _print_ranking('MALICIOUS FEATURE RANKINGS:\n', feature_names, malicious_weights)
    _print_ranking('\n\n\n\n\nBENIGN FEATURE RANKINGS:\n', feature_names, benign_weights)

# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()