FDA-Sampling/find_mismatch.py at master · byubrg/FDA-Sampling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# This module uses a hard-vote-style method to determine mismatches: for each sample it adds up
# the number of mismatch votes (1s) and divides by the number of models. If the result is greater
# than 0.5 the sample is treated as a mismatch; otherwise it is not.

import learner_functions as lf
import load_data as ld

# Build the project's data container once, when this module is imported.
data = ld.LoadData()

# Values of the 'mismatch' column of data.mismatch, converted with .tolist()
# (presumably a pandas column, giving a plain Python list -- confirm in load_data).
mismatch_labels = data.mismatch['mismatch'].tolist()


def find_mismatch_indices_hard(models, data, labels, type="default"):
    """Return the sample indices that a strict majority of models flag as mismatches.

    Every model is polled through ``lf.make_test_prediction`` and its
    predictions are treated as per-sample votes that are summed and averaged
    (presumably 0/1 values where 1 means "mismatch" -- confirm against
    ``learner_functions``).  A sample is reported only when the average vote
    is strictly greater than 0.5, so an exact tie is not a mismatch.

    Args:
        models: Sequence of trained models to poll.
        data: Samples passed unchanged to ``lf.make_test_prediction``.
        labels: Labels passed unchanged to ``lf.make_test_prediction``.
        type: Unused.  Kept (including its builtin-shadowing name) so that
            existing callers passing it positionally or by keyword still work.

    Returns:
        A list of indices, in ascending order, whose average vote exceeds
        0.5.  An empty list is returned when ``models`` is empty.
    """
    predictions = [
        lf.make_test_prediction(model, data, labels, False) for model in models
    ]

    if not predictions:
        # Without any model there are no votes to count.  Previously this
        # case crashed with an IndexError when reading the first prediction.
        return []

    model_count = len(predictions)
    # The first model's prediction defines how many samples are voted on;
    # every other prediction is indexed with the same positions.
    sample_count = len(predictions[0])

    return [
        index
        for index in range(sample_count)
        if sum(prediction[index] for prediction in predictions) / model_count > 0.5
    ]

# scoreList should hold the scores of the trained models, one score per model, in the same order as models
def find_mismatch_probabilities(mismatchedIndicies, models, scoreList, data, labels):
    """Return a score-weighted probability for every sample voted a mismatch.

    The mismatch indices are always recomputed with
    ``find_mismatch_indices_hard``; the ``mismatchedIndicies`` argument is
    ignored and is kept only so existing call sites keep working.

    Each score is divided by the sum of ``scoreList`` to obtain a per-model
    weight.  For every mismatch index the result is the sum, over all models,
    of ``predict_proba(data)[index][0]`` multiplied by that model's weight.

    Args:
        mismatchedIndicies: Ignored (overwritten before use).
        models: Trained models exposing ``predict_proba``.
        scoreList: Scores of the trained models, one per model and in the
            same order as ``models``.
        data: Samples passed to the majority vote and to ``predict_proba``.
        labels: Labels passed to the majority vote.

    Returns:
        A list with one weighted sum per mismatch index, in the order the
        indices are produced by ``find_mismatch_indices_hard``.
    """
    mismatchedIndicies = find_mismatch_indices_hard(models, data, labels)

    scoreTotal = sum(scoreList)
    if scoreTotal == 0:
        # The scores cannot be normalised when they sum to zero (this used
        # to raise ZeroDivisionError), so weight every model equally.
        scoreWeights = [1.0 / len(scoreList) for _ in scoreList]
    else:
        scoreWeights = [score / scoreTotal for score in scoreList]

    # One predict_proba result per model, in the same order as scoreWeights.
    probabilityArrays = [model.predict_proba(data) for model in models]

    # NOTE(review): column 0 of predict_proba is used here; presumably that is
    # the probability of the first class -- confirm it is the class intended
    # to represent "mismatch".
    return [
        sum(
            probabilities[index][0] * scoreWeights[position]
            for position, probabilities in enumerate(probabilityArrays)
        )
        for index in mismatchedIndicies
    ]