Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,22 @@ sh scripts/run_experiments_in_folder.sh parameters/paper_plots
```


## How to run walk forward validation
In order to run a walk forward validation run the `walk_forward_validation.py` script with the following arguments:
```
python walk_forward_validation.py --config <json configuration file> --n_folds <number of folds>
```

You can add the argument `--anchored` if you want the train set of each fold to start at the same point (anchored walk-forward validation).

You can add the argument `--jobs <number of jobs>` if you want to parallelize the execution.


The given configuration file specifies the configuration of the experiment of the first fold; the following folds are based on the same configuration but with shifted datasets.

Currently it's not possible to run walk forward validation with synthetic datasets.


## Datasets
For the simulation we discussed in our paper we used two datasets:
- Bitcoin hourly close price (BTC-USD)
Expand Down
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def run_experiment(path=None):

if par.train:
df_train, df_eval, df_test = load_train_eval_test_datasets(path)
df_train, df_eval, df_test = load_train_eval_test_datasets()
df_train.to_pickle(f'{path}/datasets/df_train.pkl')
df_eval.to_pickle(f'{path}/datasets/df_eval.pkl')
df_test.to_pickle(f'{path}/datasets/df_test.pkl')
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
joblib==1.2.0
matplotlib==3.5.3
numpy==1.23.2
pandas==1.4.4
Expand Down
2 changes: 1 addition & 1 deletion src/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def split_df(df: pd.DataFrame, cut: float):
return df.iloc[:split_point], df.iloc[split_point:]


def load_train_eval_test_datasets(path):
def load_train_eval_test_datasets():
df = fetch_dataset()

eval_test_prop = par.dataset.eval_proportion + par.dataset.test_proportion
Expand Down
76 changes: 76 additions & 0 deletions walk_forward_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import argparse
import json
import os
import random
import tempfile
from copy import deepcopy

from joblib import Parallel, delayed

from main import main
from src.data.datasets import load_train_eval_test_datasets
from src.parameters import Parameters as par
from src.util import load_json


def temporay_main(config: dict):
    """Run `main` on *config* via a temporary JSON file.

    The config is dumped into its own temporary directory so that
    concurrent folds (joblib workers) never share a path; the file and
    directory are cleaned up even if `main` raises.
    """
    # tempfile.mkdtemp creates the directory atomically with a name that is
    # guaranteed unique, unlike the previous hash(random.random()) scheme,
    # which could collide between parallel worker processes.
    temporary_folder = tempfile.mkdtemp(dir='/var/tmp')
    temporary_json_name = os.path.join(temporary_folder, 'config.json')

    with open(temporary_json_name, 'w') as f:
        json.dump(config, f, indent=4)
    try:
        main(temporary_json_name)
    finally:
        os.remove(temporary_json_name)
        os.rmdir(temporary_folder)


def get_first_fold_info(json_file: str):
    """Load the datasets configured in *json_file* and return the index
    boundaries of the first fold: train start, exclusive end, and the
    size of the test window.
    """
    par.from_json(json_file)
    df_train, _, df_test = load_train_eval_test_datasets()

    train_start = df_train.index[0]
    test_start = df_test.index[0]
    # Exclusive end bound; assumes an integer (range-like) index — TODO confirm
    # this holds for every supported dataset.
    test_end = df_test.index[-1] + 1

    return {
        'first_fold_start': train_start,
        'first_fold_end': test_end,
        'first_fold_test_size': test_end - test_start,
    }


def run_fold(first_fold_config: dict, anchored: bool, fold_index: int, first_fold_info: dict):
    """Build the configuration of one walk-forward fold and run it.

    Parameters
    ----------
    first_fold_config : dict
        Parsed JSON configuration of the first fold (annotation fixed: it
        was declared ``str`` but is deep-copied and indexed as a mapping).
    anchored : bool
        If True, every fold's train set starts where the first fold's does;
        otherwise the whole window slides forward by one test-set length.
    fold_index : int
        Zero-based index of the fold to run.
    first_fold_info : dict
        Output of `get_first_fold_info`.
    """
    config = deepcopy(first_fold_config)

    # Each successive fold shifts the evaluation window by one test-set length.
    shift = fold_index * first_fold_info['first_fold_test_size']
    end = first_fold_info['first_fold_end'] + shift
    start = first_fold_info['first_fold_start']

    if not anchored:
        start += shift

    config['dataset']['start'] = start
    config['dataset']['end'] = end
    config['dataset']['length'] = None
    config['dataset']['name'] += f'--fold{fold_index+1}'

    if anchored:
        # An anchored window grows each fold, so the test proportion must
        # shrink to keep the absolute test size constant across folds.
        config['dataset']['test_proportion'] = first_fold_info['first_fold_test_size']/(end-start)

    temporay_main(config)


def run_folds(json_file: str, n_folds: int, anchored: bool, jobs: int):
    """Run *n_folds* walk-forward folds from *json_file*, parallelized
    over *jobs* joblib workers."""
    first_fold_config = load_json(json_file)
    if 'path' not in first_fold_config['dataset']:
        raise ValueError('Dataset path is missing from configuration file')
    first_fold_info = get_first_fold_info(json_file)

    # Dispatch run_fold directly through joblib instead of a local closure.
    tasks = (
        delayed(run_fold)(first_fold_config=first_fold_config,
                          anchored=anchored,
                          fold_index=fold_index,
                          first_fold_info=first_fold_info)
        for fold_index in range(n_folds)
    )
    Parallel(n_jobs=jobs)(tasks)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Walk-forward validation runner.')
    # required=True yields a clear argparse error instead of a later crash
    # when a None config path / fold count reaches run_folds.
    parser.add_argument('--config', required=True, type=str,
                        help='JSON configuration file of the first fold')
    parser.add_argument('--n_folds', required=True, type=int,
                        help='number of walk-forward folds to run')
    # default=False instead of None; both are falsy, so behavior is unchanged.
    parser.add_argument('--anchored', action=argparse.BooleanOptionalAction, default=False,
                        help='anchor the train start of every fold at the first fold')
    parser.add_argument('--jobs', default=1, type=int,
                        help='number of parallel joblib workers')
    args = parser.parse_args()
    run_folds(args.config, args.n_folds, args.anchored, args.jobs)