# evaluation_script.py
import os
import time
import warnings
from collections import defaultdict

import attr
import numpy as np

from utils import *
from solvers import *

# LOAD_ONLY: False to instantiate all 26 solvers, or an iterable of task
# numbers (1-26) to instantiate only those.
LOAD_ONLY = False
# RETRAIN: True to refit every solver, or a set of task numbers to refit
# selectively; otherwise saved models are loaded from disk when available.
RETRAIN = False
# EVAL_ONLY: False to score every task, or a collection of task ids to
# restrict scoring to just those tasks.
EVAL_ONLY = False
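
# Example configuration (hypothetical values): instantiate and score only
# tasks 8 and 26, refitting solver 8 from scratch:
#
#   LOAD_ONLY = [8, 26]
#   RETRAIN = {8}
#   EVAL_ONLY = [8, 26]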


def zero_if_exception(scorer):
    """Decorator: if a scorer raises, score the answer as 0 instead of crashing."""
    def new_scorer(*args, **kwargs):
        try:
            return scorer(*args, **kwargs)
        except Exception:
            return 0
    return new_scorer
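

# A minimal usage sketch (the scorer below is hypothetical, not part of
# the pipeline):
#
#   @zero_if_exception
#   def strict(y_true, pred):
#       return int(y_true["correct"] == pred)
#
#   strict({}, "a")  # KeyError inside is swallowed -> returns 0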


@attr.s
class Score(object):
    """Accumulated (score, max_score) for one task number across all variants."""
    score = attr.ib(default=0)
    max_score = attr.ib(default=0)

class Evaluation(object):
def __init__(self, train_path="dataset/train",
test_path="dataset/test",
score_path="data/evaluation/scoring.json"):
self.train_path = train_path
self.test_path = test_path
self.score_path = score_path
self.secondary_score = read_config(self.score_path)["secondary_score"]
self.test_scores = []
self.first_scores = []
self.secondary_scores = []
self.task_scores = defaultdict(Score)
self.classifier = classifier.Solver()
self.clf_fitting()
        # Tasks 17-20 are all punctuation tasks; a single shared solver
        # instance handles them (see the four `lambda: s17` entries below).
        s17 = punctuator.Solver()
self.solvers = [
solver1.Solver,
solver2.Solver,
solver3.Solver,
solver4.Solver,
solver5.Solver,
solver6.Solver,
solver7.Solver,
solver8.Solver,
solver9.Solver,
            # Tasks 10-12 are handled by the same solver class, so
            # solver10 is instantiated three times, once per task.
            solver10.Solver,
            solver10.Solver,
            solver10.Solver,
solver13.Solver,
solver14.Solver,
solver15.Solver,
solver16.Solver,
lambda: s17,
lambda: s17,
lambda: s17,
lambda: s17,
solver21.Solver,
solver22.Solver,
solver23.Solver,
solver24.Solver,
solver25.Solver,
solver26.Solver
]
        global LOAD_ONLY
        if LOAD_ONLY is False:
            LOAD_ONLY = range(1, 27)  # default: instantiate all 26 solvers
        self.solvers = {i - 1: self.solvers[i - 1]() for i in LOAD_ONLY}
        self.time_limit_is_ok = True
        time_limit_is_observed = self.solver_fitting()
        if time_limit_is_observed:
            print("Fitting time limit is OK")
        else:
            self.time_limit_is_ok = False
            print("TIMEOUT: Some solvers took longer than 10m to fit!")

    def solver_fitting(self):
        time_limit_is_observed = True
        for i, solver in self.solvers.items():
            start = time.time()
            solver_index = i + 1
            # Tasks 18-20 are covered by the shared punctuation solver
            # fitted under index 17.
            if solver_index in range(18, 21):
                continue
            train_tasks = load_tasks(self.train_path,
                                     task_num=solver_index if solver_index != 17 else [17, 18, 19, 20])
            trained = False
            if RETRAIN is True or (isinstance(RETRAIN, set) and solver_index in RETRAIN) or not hasattr(solver, "load"):
                try:
                    print("Fitting Solver {}...".format(solver_index))
                    solver.fit(train_tasks)
                    if hasattr(solver, "save"):
                        solver.save("data/models/solver{}.pkl".format(solver_index))
                    trained = True
                except KeyboardInterrupt as e:
                    # Ctrl-C skips fitting this solver; fall back to loading
                    # a previously saved model below.
                    print(e)
            if not trained:
                print("Loading Solver {}".format(solver_index))
                solver.load("data/models/solver{}.pkl".format(solver_index))
            duration = time.time() - start
            if duration > 60:
                time_limit_is_observed = False
                print("Time limit is violated in solver {}, which has been fitting for {}m {:.0f}s".format(
                    solver_index, int(duration // 60), duration % 60))
            print("Solver {} is ready!\n".format(solver_index))
        return time_limit_is_observed

    def clf_fitting(self):
        try:
            self.classifier.load("data/models/clf.pkl")
            print("Loaded classifier")
        except OSError:
            # No saved model on disk: fit the task-type classifier from scratch.
            tasks = []
            for filename in os.listdir(self.train_path):
                if filename.endswith(".json"):
                    data = read_config(os.path.join(self.train_path, filename))
                    tasks.append(data)
            print("Fitting Classifier...")
            self.classifier.fit(tasks)
            self.classifier.save("data/models/clf.pkl")
            print("Classifier is ready!")

    # for all tasks worth 1 point
@zero_if_exception
def get_score(self, y_true, prediction):
if "correct" in y_true:
if y_true["correct"] == prediction:
return 1
elif "correct_variants" in y_true and isinstance(y_true["correct_variants"][0], str):
if prediction in y_true["correct_variants"]:
return 1
elif "correct_variants" in y_true and isinstance(y_true["correct_variants"][0], list):
y_true = set(y_true["correct_variants"][0])
y_pred = set(prediction)
return int(len(set.intersection(y_true, y_pred)) == len(y_true) == len(y_pred))
return 0
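
    # The three answer formats handled above (field names as in the dataset;
    # values here are made up):
    #
    #   {"correct": "2"}                       -> exact match
    #   {"correct_variants": ["a", "b"]}       -> any listed string matches
    #   {"correct_variants": [["1", "3"]]}     -> exact set match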

    # for tasks 8 and 26 (matching tasks: one point per correctly matched pair)
@zero_if_exception
def get_matching_score(self, y_true, pred):
score = 0
y_true = y_true["correct"]
if len(y_true) != len(pred):
return 0
for y in y_true:
if y_true[y] == pred[y]:
score += 1
return score
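
    # E.g. (made-up answer): y_true["correct"] = {"A": "1", "B": "4"} and
    # pred = {"A": "1", "B": "2"} scores 1 out of a maximum of 2.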

    # for task 16
    @zero_if_exception
    def get_multiple_score(self, y_true, y_pred):
        y_true = y_true["correct_variants"][0] if "correct_variants" in y_true else y_true["correct"]
        # Pad y_pred so that every prediction beyond len(y_true) costs a point:
        # score = hits - (len(y_pred) - len(y_true)), floored at 0.
        while len(y_pred) < len(y_true):
            y_pred.append(-1)
        return max(0, len(set.intersection(set(y_true), set(y_pred))) - len(y_pred) + len(y_true))
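
    # Worked example (made-up answer): y_true = ["1", "3"], y_pred = ["1", "2", "3"]
    # gives 2 hits - 1 excess prediction = 1 point.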

    def variant_score(self, variant_scores):
        # The sum of per-task points (the "first score") is mapped to the
        # official secondary score via the table loaded from scoring.json.
        first_score = sum(variant_scores)
        mean_score = round(np.mean(variant_scores), 3)
        secondary_score = int(self.secondary_score[str(first_score)])
        scores = {"first_score": first_score, "mean_accuracy": mean_score, "secondary_score": secondary_score}
        self.first_scores.append(first_score)
        self.secondary_scores.append(secondary_score)
        return scores
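
    # Assumed shape of scoring.json (keys are raw first scores as strings,
    # values the corresponding secondary scores; the numbers are placeholders):
    #
    #   {"secondary_score": {"0": 0, "1": 3, "2": 5}}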
def get_overall_scores(self):
overall_scores = {}
for variant, variant_scores in enumerate(self.test_scores):
scores = self.variant_score(variant_scores)
print("***YOUR RESULTS***")
print("Variant: {}".format(variant + 1))
print("Scores: {}\n".format(scores))
overall_scores[str(variant + 1)] = scores
self.overall_scores = overall_scores
return self

    def predict_from_baseline(self):
        time_limit_is_observed = True
        clf_errors = 0
        solver_errors = 0
        for filename in os.listdir(self.test_path):
            predictions = []
            print("Solving {}".format(filename))
            # The last task of each variant (the essay) is dropped: it is
            # not scored automatically by this script.
            data = read_config(os.path.join(self.test_path, filename))[:-1]
            clf_predictions = self.classifier.predict(data)
            for i, task in enumerate(data):
                task_index, task_type = int(task["id"]), task["question"]["type"]
                task_number = clf_predictions[i]
                if task_index != task_number:
                    # Confusions within the 17-20 block are not counted as
                    # classifier errors: those tasks share one solver anyway.
                    if task_index not in range(17, 21) or task_number not in range(17, 21):
                        clf_errors += 1
                if EVAL_ONLY is not False and task_index not in EVAL_ONLY:
                    continue
                start = time.time()
                print("Predicting task {} ({})...".format(task_index, task_number))
                y_true = task["solution"]
                prediction = "invalid"  # sentinel kept if the solver raises
                try:
                    prediction = self.solvers[task_number - 1].predict_from_model(task)
                except Exception as e:
                    solver_errors += 1
                    print(e)
                if task_type == "matching":
                    score = self.get_matching_score(y_true, prediction)
                elif task_index == 16:
                    score = self.get_multiple_score(y_true, prediction)
                else:
                    score = self.get_score(y_true, prediction)
                if task_type == "matching" or task_index == 16:
                    # The maximum score equals the number of correct values.
                    if "correct_variants" in y_true:
                        correct_values = y_true["correct_variants"][0]
                    else:
                        correct_values = y_true["correct"]
                    max_score = len(correct_values)
                else:
                    max_score = 1
self.task_scores[task_index].score += score
self.task_scores[task_index].max_score += max_score
print("Score: {} / {}\nCorrect: {}\nPrediction: {}\n".format(score, max_score, y_true, prediction))
predictions.append(score)
                duration = time.time() - start
                if duration > 60:
                    time_limit_is_observed = False
                    self.time_limit_is_ok = False
                    print("Time limit is violated in solver {}, which has been predicting for {}m {:.0f}s".format(
                        task_number, int(duration // 60), duration % 60))
self.test_scores.append(predictions)
        print("Total classifier errors: {}".format(clf_errors))
        print("Total solver errors: {}".format(solver_errors))
return time_limit_is_observed
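

# Each task dict read from the dataset JSON is assumed to carry at least the
# fields used above ("id", "question", "solution"); the values here are made up:
#
#   {"id": "8", "question": {"type": "matching"}, "solution": {"correct": {"A": "1"}}}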


def main():
    warnings.filterwarnings("ignore")
    evaluation = Evaluation()
    time_limit_is_observed = evaluation.predict_from_baseline()
    if not time_limit_is_observed:
        print("TIMEOUT: some solvers predict longer than 60s!")
    evaluation.get_overall_scores()
    mean_first_score = np.mean(evaluation.first_scores)
    mean_secondary_score = np.mean(evaluation.secondary_scores)
    print("Mean First Score: {}".format(mean_first_score))
    print("Mean Secondary Score: {}".format(mean_secondary_score))
    print("Results per task:")
    for task_id in sorted(evaluation.task_scores, key=lambda id: int(id)):
        print("{}\t{:.3%}".format(
            task_id,
            float(evaluation.task_scores[task_id].score) / evaluation.task_scores[task_id].max_score
            if evaluation.task_scores[task_id].max_score != 0 else 0.))
    # The same per-task accuracies again as bare percentages, one per line,
    # which is convenient to paste into a spreadsheet.
    for task_id in sorted(evaluation.task_scores, key=lambda id: int(id)):
        print("{}".format(
            float(evaluation.task_scores[task_id].score) / evaluation.task_scores[task_id].max_score * 100
            if evaluation.task_scores[task_id].max_score != 0 else 0.))
    if evaluation.time_limit_is_ok:
        print("Time limit is not broken by any of the solvers.")
    else:
        print("TIMEOUT: Time limit is violated by some of the solvers.")


if __name__ == "__main__":
    main()
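
# To reproduce the evaluation, run this script from the repository root,
# since the dataset/ and data/ paths above are relative:
#
#   python evaluation_script.py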