From 99546262c58afcfe888111420689b89a659d18e3 Mon Sep 17 00:00:00 2001
From: Hilde Weerts
Date: Wed, 4 Apr 2018 18:49:42 -0400
Subject: [PATCH 1/2] Fixed url too long error when num_runs > 500. Not pretty
 but it works.

---
 activetesting/eval/create_arff.py |  7 +++----
 activetesting/utils/connect.py    | 27 ++++++++++++++++++++-------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/activetesting/eval/create_arff.py b/activetesting/eval/create_arff.py
index 99b83a2..6a0e7f7 100644
--- a/activetesting/eval/create_arff.py
+++ b/activetesting/eval/create_arff.py
@@ -15,19 +15,18 @@ def parse_args():
     parser.add_argument('--study_id', type=str, default='OpenML100', help='the tag to obtain the tasks from')
     parser.add_argument('--classifier', type=str, default='random_forest', help='openml flow id')
     parser.add_argument('--scoring', type=str, default='predictive_accuracy')
-    parser.add_argument('--num_runs', type=int, default=500, help='max runs to obtain from openml')
+    parser.add_argument('--num_runs', type=int, default=1000, help='max runs to obtain from openml')
     parser.add_argument('--prevent_model_cache', action='store_true', help='prevents loading old models from cache')
     parser.add_argument('--openml_server', type=str, default=None, help='the openml server location')
     parser.add_argument('--openml_apikey', type=str, default=None, help='the apikey to authenticate to OpenML')
     parser.add_argument('--num_tasks', type=int, default=None, help='limit number of tasks (for testing)')
     return parser.parse_args()
 
-
 if __name__ == '__main__':
     args = parse_args()
     study = openml.study.get_study(args.study_id, 'tasks')
     setup_data_all = None
-
+
     if args.classifier == 'random_forest':
         flow_id = 6969
         relevant_parameters = {"bootstrap": "nominal", "max_features": "numeric", "min_samples_leaf": "numeric",
@@ -86,4 +85,4 @@ def parse_args():
     arff_dict = activetesting.utils.dataframe_to_arff(meta_data)
     filename = 'meta_%s.arff' %args.classifier
     with open(filename, 'w') as fp:
-        arff.dump(arff_dict, fp)
+        arff.dump(arff_dict, fp)
\ No newline at end of file
diff --git a/activetesting/utils/connect.py b/activetesting/utils/connect.py
index a4f97ee..4be82f3 100644
--- a/activetesting/utils/connect.py
+++ b/activetesting/utils/connect.py
@@ -4,7 +4,7 @@
 import os
 import pandas as pd
 import pickle
-
+import math
 
 def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, evaluation_measure, cache_directory):
     if 'y' in relevant_parameters:
@@ -19,19 +19,32 @@ def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, e
     evaluations_cache_path = cache_directory + '/' + str(flow_id) + '/' + str(task_id) + '/evaluations.pkl'
     setups_cache_path = cache_directory + '/' + str(flow_id) + '/' + str(task_id) + '/setups.pkl'
     if not os.path.isfile(evaluations_cache_path) or not os.path.isfile(setups_cache_path):
-        evaluations = openml.evaluations.list_evaluations(evaluation_measure, size=num_runs, task=[task_id], flow=[flow_id])
+        evaluations = {}
+        for i in range(0, math.ceil(num_runs/500)):
+            if i == math.ceil(num_runs/500) - 1:
+                if num_runs%500 == 0:
+                    evaluations.update(openml.evaluations.list_evaluations(evaluation_measure, size=500, task=[task_id], flow=[flow_id], offset=i*500))
+                else:
+                    evaluations.update(openml.evaluations.list_evaluations(evaluation_measure, size=num_runs%500, task=[task_id], flow=[flow_id], offset=i*500))
+            else:
+                evaluations.update(openml.evaluations.list_evaluations(evaluation_measure, size=500, task=[task_id], flow=[flow_id], offset=i*500))
         if len(evaluations) == 0:
             raise ValueError('No evaluations for this task. ')
-
         with open(evaluations_cache_path, 'wb') as fp:
             pickle.dump(evaluations, fp)
-
+        print(len(evaluations))
         # setups
         setup_ids = []
         for run_id, evaluation in evaluations.items():
             setup_ids.append(evaluation.setup_id)
-        setups = openml.setups.list_setups(setup=setup_ids)
-
+
+        setups = {}
+        for i in range(0, math.ceil(len(setup_ids)/500)):
+            if i == math.ceil(num_runs/500) - 1:
+                setups.update(openml.setups.list_setups(setup=setup_ids[i*500:]))
+            else:
+                setups.update(openml.setups.list_setups(setup=setup_ids[i*500:i*500+500]))
+        print(len(setups))
         with open(setups_cache_path, 'wb') as fp:
             pickle.dump(setups, fp)
 
@@ -94,7 +107,7 @@ def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, e
     if dataframe.shape[1] != len(relevant_parameters) + 1: # plus 1 for y data
         raise ValueError()
 
-    dataframe = dataframe.reindex_axis(sorted(dataframe.columns), axis=1)
+    dataframe = dataframe.reindex(sorted(dataframe.columns), axis=1)
 
     return dataframe
 

From 9e88b9ce6fe0c79f31c642da7d134aeec49199a6 Mon Sep 17 00:00:00 2001
From: Hilde Weerts
Date: Wed, 4 Apr 2018 18:52:29 -0400
Subject: [PATCH 2/2] no message

---
 activetesting/utils/connect.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/activetesting/utils/connect.py b/activetesting/utils/connect.py
index 4be82f3..fe82dc1 100644
--- a/activetesting/utils/connect.py
+++ b/activetesting/utils/connect.py
@@ -32,7 +32,7 @@ def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, e
             raise ValueError('No evaluations for this task. ')
         with open(evaluations_cache_path, 'wb') as fp:
             pickle.dump(evaluations, fp)
-        print(len(evaluations))
+
         # setups
         setup_ids = []
         for run_id, evaluation in evaluations.items():
@@ -44,7 +44,7 @@ def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, e
                 setups.update(openml.setups.list_setups(setup=setup_ids[i*500:]))
             else:
                 setups.update(openml.setups.list_setups(setup=setup_ids[i*500:i*500+500]))
-        print(len(setups))
+
         with open(setups_cache_path, 'wb') as fp:
             pickle.dump(setups, fp)
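
A note on the chunking logic introduced in PATCH 1, as a sketch rather than part of the
patch: the last-chunk special cases can be dropped, because range() accepts a step
argument and Python slicing clamps at the end of a list. That would also sidestep the
boundary test in the setups loop, which compares i against math.ceil(num_runs/500)
where math.ceil(len(setup_ids)/500) appears to be intended. The sketch below assumes
the same openml-python calls the patch already uses; the helper names
fetch_evaluations_chunked and fetch_setups_chunked are hypothetical.

    import openml

    def fetch_evaluations_chunked(evaluation_measure, num_runs, task_id, flow_id,
                                  chunk_size=500):
        # Page through list_evaluations so no single request asks for more than
        # chunk_size runs, keeping the request URL short enough for the server.
        evaluations = {}
        for offset in range(0, num_runs, chunk_size):
            size = min(chunk_size, num_runs - offset)  # final chunk may be smaller
            evaluations.update(openml.evaluations.list_evaluations(
                evaluation_measure, size=size, task=[task_id], flow=[flow_id],
                offset=offset))
        return evaluations

    def fetch_setups_chunked(setup_ids, chunk_size=500):
        # Slicing clamps at len(setup_ids), so the last chunk needs no special case.
        setups = {}
        for start in range(0, len(setup_ids), chunk_size):
            setups.update(openml.setups.list_setups(
                setup=setup_ids[start:start + chunk_size]))
        return setups

With helpers like these, the two loop bodies added in connect.py would reduce to
evaluations = fetch_evaluations_chunked(evaluation_measure, num_runs, task_id, flow_id)
and setups = fetch_setups_chunked(setup_ids).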