From 0b27e7b029896b63149edd8368323f0be7224283 Mon Sep 17 00:00:00 2001 From: Davide Date: Thu, 6 Oct 2022 11:08:46 +0200 Subject: [PATCH 1/4] Implement walk forward validation --- README.md | 16 +++++++++ main.py | 2 +- src/data/datasets.py | 2 +- walk_forward_validation.py | 73 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 walk_forward_validation.py diff --git a/README.md b/README.md index 18f3bb6..0b1ecfa 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,22 @@ sh scripts/run_experiments_in_folder.sh parameters/paper_plots ``` +## How to run walk forward validation +To run walk forward validation, run the `walk_forward_validation.py` script with the following arguments: +``` +python walk_forward_validation.py --config CONFIG_FILE --n_folds N_FOLDS +``` + +You can add the argument `--anchored` if you want the train set of each fold to start at the same point. + +You can add the argument `--jobs N_JOBS` if you want to parallelize the execution. + + +The given configuration file specifies the configuration of the experiment of the first fold; the following folds will be based on the same configuration but with different datasets. + +Currently it's not possible to run walk forward validation with synthetic datasets. 
+ + ## Datasets For the simulation we discussed in our paper we used two datasets: - Bitcoin hourly close price (BTC-USD) diff --git a/main.py b/main.py index 02164c3..3f442df 100644 --- a/main.py +++ b/main.py @@ -16,7 +16,7 @@ def run_experiment(path=None): if par.train: - df_train, df_eval, df_test = load_train_eval_test_datasets(path) + df_train, df_eval, df_test = load_train_eval_test_datasets() df_train.to_pickle(f'{path}/datasets/df_train.pkl') df_eval.to_pickle(f'{path}/datasets/df_eval.pkl') df_test.to_pickle(f'{path}/datasets/df_test.pkl') diff --git a/src/data/datasets.py b/src/data/datasets.py index 32631fa..7f29b79 100644 --- a/src/data/datasets.py +++ b/src/data/datasets.py @@ -37,7 +37,7 @@ def split_df(df: pd.DataFrame, cut: float): return df.iloc[:split_point], df.iloc[split_point:] -def load_train_eval_test_datasets(path): +def load_train_eval_test_datasets(): df = fetch_dataset() eval_test_prop = par.dataset.eval_proportion + par.dataset.test_proportion diff --git a/walk_forward_validation.py b/walk_forward_validation.py new file mode 100644 index 0000000..e0102a8 --- /dev/null +++ b/walk_forward_validation.py @@ -0,0 +1,73 @@ +import argparse +import json +import os +import random +from joblib import Parallel, delayed +from copy import deepcopy +from main import main +from src.util import load_json +from src.data.datasets import load_train_eval_test_datasets +from src.parameters import Parameters as par + + +def temporay_main(config: dict): + temporary_folder = f"/var/tmp/{hash(random.random())}" + os.mkdir(temporary_folder) + temporary_json_name = os.path.join(temporary_folder, 'config.json') + + with open(temporary_json_name, 'w') as f: + json.dump(config, f, indent=4) + try: + main(temporary_json_name) + finally: + os.remove(temporary_json_name) + os.rmdir(temporary_folder) + + +def get_first_fold_info(json_file: str): + par.from_json(json_file) + df_train, _, df_test = load_train_eval_test_datasets() + return {'first_fold_start': 
df_train.index[0], + 'first_fold_end': df_test.index[-1]+1, + 'first_fold_test_size': df_test.index[-1] + 1 - df_test.index[0]} + + +def run_fold(first_fold_config: str, anchored: bool, fold_index: int, first_fold_info: dict): + config = deepcopy(first_fold_config) + + shift = fold_index * first_fold_info['first_fold_test_size'] + end = first_fold_info['first_fold_end'] + shift + start = first_fold_info['first_fold_start'] + + if not anchored: + start += shift + + config['dataset']['start'] = start + config['dataset']['end'] = end + config['dataset']['length'] = None + config['dataset']['name'] += f'_fold{fold_index+1}' + + temporay_main(config) + + +def run_folds(json_file: str, n_folds: int, anchored: bool, jobs: int): + first_fold_config = load_json(json_file) + if 'path' not in first_fold_config['dataset']: + raise ValueError('Dataset path is missing from configuration file') + first_fold_info = get_first_fold_info(json_file) + + def process(i): + run_fold(first_fold_config=first_fold_config, anchored=anchored, + fold_index=i, first_fold_info=first_fold_info) + + Parallel(n_jobs=jobs)(delayed(process)(i) for i in range(n_folds)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', default=None, type=str) + parser.add_argument('--n_folds', default=None, type=int) + parser.add_argument('--anchored', action=argparse.BooleanOptionalAction) + parser.add_argument('--jobs', default=1, type=int) + args = parser.parse_args() + run_folds(args.config, args.n_folds, args.anchored, args.jobs) From 8d6623f416f47c3fefb6a1d4bf84dfdfc180c014 Mon Sep 17 00:00:00 2001 From: Davide Scassola Date: Wed, 12 Oct 2022 15:26:31 +0200 Subject: [PATCH 2/4] update requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b12f8e1..75676dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +joblib==1.2.0 matplotlib==3.5.3 numpy==1.23.2 pandas==1.4.4 From 
d964d0cb6c9c3ffe0df21fca89abd28462e17536 Mon Sep 17 00:00:00 2001 From: Davide Date: Fri, 14 Oct 2022 09:22:58 +0200 Subject: [PATCH 3/4] fix test size when anchored --- walk_forward_validation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/walk_forward_validation.py b/walk_forward_validation.py index e0102a8..684559b 100644 --- a/walk_forward_validation.py +++ b/walk_forward_validation.py @@ -46,6 +46,9 @@ def run_fold(first_fold_config: str, anchored: bool, fold_index: int, first_fold config['dataset']['end'] = end config['dataset']['length'] = None config['dataset']['name'] += f'_fold{fold_index+1}' + + if anchored: + config['dataset']['test_proportion'] = first_fold_info['first_fold_test_size']/(end-start) temporay_main(config) From df2cfaeff4c671e4d3c8d7372a6e7a6ff9cedeff Mon Sep 17 00:00:00 2001 From: Davide Date: Fri, 14 Oct 2022 09:31:58 +0200 Subject: [PATCH 4/4] fix folds name --- walk_forward_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/walk_forward_validation.py b/walk_forward_validation.py index 684559b..c893a34 100644 --- a/walk_forward_validation.py +++ b/walk_forward_validation.py @@ -45,7 +45,7 @@ def run_fold(first_fold_config: str, anchored: bool, fold_index: int, first_fold config['dataset']['start'] = start config['dataset']['end'] = end config['dataset']['length'] = None - config['dataset']['name'] += f'_fold{fold_index+1}' + config['dataset']['name'] += f'--fold{fold_index+1}' if anchored: config['dataset']['test_proportion'] = first_fold_info['first_fold_test_size']/(end-start)