activetesting/eval/create_arff.py (3 additions, 4 deletions)

@@ -15,19 +15,18 @@ def parse_args():
    parser.add_argument('--study_id', type=str, default='OpenML100', help='the tag to obtain the tasks from')
    parser.add_argument('--classifier', type=str, default='random_forest', help='openml flow id')
    parser.add_argument('--scoring', type=str, default='predictive_accuracy')
-    parser.add_argument('--num_runs', type=int, default=500, help='max runs to obtain from openml')
+    parser.add_argument('--num_runs', type=int, default=1000, help='max runs to obtain from openml')
    parser.add_argument('--prevent_model_cache', action='store_true', help='prevents loading old models from cache')
    parser.add_argument('--openml_server', type=str, default=None, help='the openml server location')
    parser.add_argument('--openml_apikey', type=str, default=None, help='the apikey to authenticate to OpenML')
    parser.add_argument('--num_tasks', type=int, default=None, help='limit number of tasks (for testing)')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    study = openml.study.get_study(args.study_id, 'tasks')
    setup_data_all = None

    if args.classifier == 'random_forest':
        flow_id = 6969
        relevant_parameters = {"bootstrap": "nominal", "max_features": "numeric", "min_samples_leaf": "numeric",

@@ -86,4 +85,4 @@
    arff_dict = activetesting.utils.dataframe_to_arff(meta_data)
    filename = 'meta_%s.arff' %args.classifier
    with open(filename, 'w') as fp:
-        arff.dump(arff_dict, fp)
\ No newline at end of file
+        arff.dump(arff_dict, fp)
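
The dictionary handed to `arff.dump` suggests the liac-arff package, whose `dump(obj, fp)` serializes a dictionary with `relation`, `attributes`, and `data` keys. A minimal sketch of that shape, with toy values and hypothetical attribute names not taken from the repository:

import arff  # liac-arff

# Illustrative only: the dictionary layout liac-arff expects.
arff_dict = {
    'description': 'example meta-data',
    'relation': 'meta_random_forest',
    'attributes': [
        ('max_features', 'NUMERIC'),      # hypothetical hyperparameter column
        ('min_samples_leaf', 'NUMERIC'),  # hypothetical hyperparameter column
        ('y', 'NUMERIC'),                 # the evaluation measure
    ],
    'data': [
        [0.5, 1, 0.91],
        [0.2, 4, 0.88],
    ],
}

with open('example.arff', 'w') as fp:
    arff.dump(arff_dict, fp)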
activetesting/utils/connect.py (18 additions, 5 deletions)

@@ -4,7 +4,7 @@
import os
import pandas as pd
import pickle

+import math

def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, evaluation_measure, cache_directory):
    if 'y' in relevant_parameters:

@@ -19,18 +19,31 @@ def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, evaluation_measure, cache_directory):
    evaluations_cache_path = cache_directory + '/' + str(flow_id) + '/' + str(task_id) + '/evaluations.pkl'
    setups_cache_path = cache_directory + '/' + str(flow_id) + '/' + str(task_id) + '/setups.pkl'
    if not os.path.isfile(evaluations_cache_path) or not os.path.isfile(setups_cache_path):
-        evaluations = openml.evaluations.list_evaluations(evaluation_measure, size=num_runs, task=[task_id], flow=[flow_id])
+        # Fetch evaluations in pages of 500; the final page requests only the
+        # remainder, so at most num_runs evaluations are retrieved in total.
+        evaluations = {}
+        for i in range(0, math.ceil(num_runs / 500)):
+            if i == math.ceil(num_runs / 500) - 1:
+                if num_runs % 500 == 0:
+                    evaluations.update(openml.evaluations.list_evaluations(evaluation_measure, size=500, task=[task_id], flow=[flow_id], offset=i*500))
+                else:
+                    evaluations.update(openml.evaluations.list_evaluations(evaluation_measure, size=num_runs % 500, task=[task_id], flow=[flow_id], offset=i*500))
+            else:
+                evaluations.update(openml.evaluations.list_evaluations(evaluation_measure, size=500, task=[task_id], flow=[flow_id], offset=i*500))
        if len(evaluations) == 0:
            raise ValueError('No evaluations for this task.')

        with open(evaluations_cache_path, 'wb') as fp:
            pickle.dump(evaluations, fp)

        # setups
        setup_ids = []
        for run_id, evaluation in evaluations.items():
            setup_ids.append(evaluation.setup_id)
-        setups = openml.setups.list_setups(setup=setup_ids)

+        # Page through the setups the same way, 500 setup ids per request.
+        # The last-page check uses len(setup_ids), since fewer evaluations
+        # than num_runs may have been returned.
+        setups = {}
+        for i in range(0, math.ceil(len(setup_ids) / 500)):
+            if i == math.ceil(len(setup_ids) / 500) - 1:
+                setups.update(openml.setups.list_setups(setup=setup_ids[i*500:]))
+            else:
+                setups.update(openml.setups.list_setups(setup=setup_ids[i*500:i*500+500]))

        with open(setups_cache_path, 'wb') as fp:
            pickle.dump(setups, fp)
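
Both pagination loops follow the same chunk-and-merge pattern. As a sketch only (this helper is hypothetical, not part of the PR), the shared logic could be factored out; `fetch_page` stands in for either OpenML list call:

import math

def fetch_paged(fetch_page, total, page_size=500):
    # Collect up to `total` results by calling fetch_page(offset, size)
    # once per chunk; assumes fetch_page returns a dict keyed by id,
    # like the openml list_* functions above.
    results = {}
    for i in range(math.ceil(total / page_size)):
        offset = i * page_size
        size = min(page_size, total - offset)  # the last page may be partial
        results.update(fetch_page(offset, size))
    return results

With this helper the evaluations loop reduces to a single call, e.g. fetch_paged(lambda offset, size: openml.evaluations.list_evaluations(evaluation_measure, size=size, offset=offset, task=[task_id], flow=[flow_id]), num_runs).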
@@ -94,7 +107,7 @@ def get_dataframe_from_openml(task_id, flow_id, num_runs, relevant_parameters, evaluation_measure, cache_directory):
    if dataframe.shape[1] != len(relevant_parameters) + 1:  # plus 1 for y data
        raise ValueError()

-    dataframe = dataframe.reindex_axis(sorted(dataframe.columns), axis=1)
+    dataframe = dataframe.reindex(sorted(dataframe.columns), axis=1)

    return dataframe
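
The reindex_axis-to-reindex change tracks the pandas API: DataFrame.reindex_axis was deprecated and later removed, and column reordering is now spelled with reindex. A minimal illustration with toy data, not from the repository:

import pandas as pd

df = pd.DataFrame({'b': [1, 2], 'a': [3, 4]})
# Order the columns alphabetically: a, b
df = df.reindex(sorted(df.columns), axis=1)
# Equivalent keyword form: df.reindex(columns=sorted(df.columns))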
