diff --git a/config/custom/tcn_config_local.yaml b/config/custom/tcn_config_local.yaml index 7ede2d23..5da5c8f9 100644 --- a/config/custom/tcn_config_local.yaml +++ b/config/custom/tcn_config_local.yaml @@ -39,7 +39,7 @@ optuna_submission_delay: 30 output_size: 3 num_channels: [150, 150, 150, 150, 150, 150, 150] -kernel_size: 3 +kernel_size: 10 dropout: 0.5 use_last_timepoint: True last_y: False diff --git a/config/custom/tcn_config_m.yaml b/config/custom/tcn_config_m.yaml index 1dba8fa5..64e60696 100644 --- a/config/custom/tcn_config_m.yaml +++ b/config/custom/tcn_config_m.yaml @@ -22,11 +22,11 @@ tags: - ethusdt_volume500 - simple_lookahead_y no_comet_logger: True -seed: 40 +seed: 42 batch_size: 256 early_stopping_rounds: 15 optimizer: adabelief -lr: 0.001 +lr: 'auto' max_lr: 0.1 max_lr_multiplier: 10 one_cycle_length: 60 @@ -40,7 +40,7 @@ output_size: 1 num_channels: [20, 20, 20, 20] kernel_size: 3 dropout: 0.5 -use_last_timepoint: False +use_last_timepoint: True last_y: True non_last_y_frac: 0.5 regression: False @@ -58,14 +58,14 @@ no_sample_weights: True data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" lookback: auto -mini_series_length: 20 +mini_series_length: auto # If this is set to a number, then simple lookahead labelling is in place simple_lookahead_y: 15 simple_lookahead_reg: False # If this is True, anchor is labelled before preprocessing. to_label and simple_lookahead_y cannot be used together. 
-to_label: False +to_label: True label_sl: 1 label_pt: 1 label_first_or_max: "first" @@ -84,6 +84,12 @@ cols_to_model: - high - low - close + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell # - open_fd_0.0 # - high_fd_0.0 # - low_fd_0.0 @@ -94,12 +100,6 @@ cols_to_model: # - close_fd_tuned # - cum_ticks # - cum_dollar -# - volume -# - cum_volume_buy -# - cum_volume_sell -# - cum_volume_quote -# - cum_volume_quote_buy -# - cum_volume_quote_sell # - sin_date # - cos_date # - sin_time @@ -131,22 +131,23 @@ augment_method: random_fast augment_prob: 0.25 simple_augment_dfs: - std_bar_BTCUSDT_tick_1.feather - - std_bar_LTCUSDT_tick_1.feather - - std_bar_XRPUSDT_tick_1.feather - - std_bar_BTCUSDT_volume_100.feather - - std_bar_LTCUSDT_volume_1000.feather - - std_bar_XRPUSDT_volume_125000.feather - - std_bar_BTCUSDT_dollar_1000000.feather - - std_bar_LTCUSDT_dollar_40000.feather - - std_bar_XRPUSDT_dollar_20000.feather +# - std_bar_LTCUSDT_tick_1.feather +# - std_bar_XRPUSDT_tick_1.feather +# - std_bar_BTCUSDT_volume_100.feather +# - std_bar_LTCUSDT_volume_1000.feather +# - std_bar_XRPUSDT_volume_125000.feather +# - std_bar_BTCUSDT_dollar_1000000.feather +# - std_bar_LTCUSDT_dollar_40000.feather +# - std_bar_XRPUSDT_dollar_20000.feather simple_augment_prob: 0.5 +augment_dfs_mix: 0.33 # -------------------------------------------------------------------------------------- # PREPROCESSING # -------------------------------------------------------------------------------------- train_start_date: "2018-06-01" -train_days: 30 +train_days: 2 val_days: 1 val_train_offset_days: 1 val_puffer_days: 1 diff --git a/config/rl_config.yaml b/config/rl_config.yaml new file mode 100644 index 00000000..cf24c076 --- /dev/null +++ b/config/rl_config.yaml @@ -0,0 +1,165 @@ + + +# -------------------------------------------------------------------------------------- +# LIGHTNING +# 
-------------------------------------------------------------------------------------- + +gpus: 1 +pin_memory: True +profiler: True +#val_check_interval: 0.5 +# enable it with 'power' or 'binsearch' +auto_scale_batch_size: +#precision: 16 + +# -------------------------------------------------------------------------------------- +# RUN +# -------------------------------------------------------------------------------------- + +log_dir: logs +num_workers: 1 +exp_name: RL-PPO-TCN +tags: + - RL_test +no_comet_logger: True +seed: 42 +batch_size: 500 +max_epochs: 100 + +# -------------------------------------------------------------------------------------- +# RL +# -------------------------------------------------------------------------------------- + +asset_names: + - BTC + - ETH + - XRP + - LTC +trading_cost: 0.002 +reward_type: portfolio_vs_market +num_env_heads: 20 +num_env_workers: 1 +normalize_advantages: True +pgportfolio: False +max_episode_length: 2000 +steps_per_epoch: 80000 +n_optim_iters: 4 +gamma: 0.99 +lam: 0.95 +lr_actor: 0.0001 +lr_critic: 0.0003 +clip_ratio: 0.2 +target_kl: 0.01 + + +# don't change these, or preprocessing won't work +target_col: rl_return +to_label: False +no_sample_weights: True +binariser_method: + +# -------------------------------------------------------------------------------------- +# MODEL +# -------------------------------------------------------------------------------------- + +actor_num_channels: [50, 50, 50, 50, 50] +actor_kernel_size: 5 +actor_dropout: 0.2 +# sample size - exp abs diff to mean | 20 - 5% | 50 - 3% | 100 - 2% | 500 - 1% +actor_dirichlet_sample_size: 20 +critic_num_channels: [50, 50, 50, 50, 50] +critic_kernel_size: 5 +critic_dropout: 0.2 +use_last_timepoint: False + +# -------------------------------------------------------------------------------------- +# DATA +# -------------------------------------------------------------------------------------- + +data_dir: "C:/Work/dagobert/data/modelling" + 
+lookback: auto +mini_series_length: auto + +df_train: + # anchor: std_bar_BTCUSDT_volume_100.feather + # df2: std_bar_ETHUSDT_volume_500.feather + # df3: std_bar_XRPUSDT_volume_125000.feather + # df4: std_bar_LTCUSDT_volume_1000.feather + anchor: std_bar_BTCUSDT_tick_1.feather + df2: std_bar_ETHUSDT_tick_1.feather + df3: std_bar_XRPUSDT_tick_1.feather + df4: std_bar_LTCUSDT_tick_1.feather +df_val: +df_test: +cols_to_model: + anchor: + - date_diff + - open + - high + - low + - close + # - open_fd_0.0 + # - high_fd_0.0 + # - low_fd_0.0 + # - close_fd_0.0 + # - open_fd_tuned + # - high_fd_tuned + # - low_fd_tuned + # - close_fd_tuned + - cum_ticks + - cum_dollar + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell + - sin_date + - cos_date + - sin_time + - cos_time + # - boll + # - boll_lb + # - boll_ub + # - macd + # - macds + # - macdh + # - wr_60 + # - rsi_60 + # - rsv_60 + # - atr_60 + # - cci_60 + # - kdjk_60 + # - kdjd_60 + # - kdjj_60 + # - pdi_60 + # - mdi_60 + # - vr_60 + df2: + df3: + df4: + # the cols of the secondary DFs will automatically be set to anchor's if not defined + +time_feat_n: 1 +time_embed_dim: 12 + +augment_method: random_fast +augment_prob: 0 +augment_dfs: +augment_dfs_mix: 0 + +# -------------------------------------------------------------------------------------- +# PREPROCESSING +# -------------------------------------------------------------------------------------- + +train_start_date: "2019-01-01" +train_days: 500 +val_days: 30 +val_train_offset_days: 1 +val_puffer_days: 1 +test_days: 30 +test_train_offset_days: 62 +test_puffer_days: 1 + +scaling_method: minmax diff --git a/config/tcn_config_data.yaml b/config/tcn_config_data.yaml index da8eca4b..ea9a5926 100644 --- a/config/tcn_config_data.yaml +++ b/config/tcn_config_data.yaml @@ -59,7 +59,8 @@ no_sample_weights: False # DATA # -------------------------------------------------------------------------------------- 
-data_dir: "/home/ubuntu/dagobert/data/modelling" +#data_dir: "/home/ubuntu/dagobert/data/modelling" +data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" lookback: auto mini_series_length: auto diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml new file mode 100644 index 00000000..10a9f0bc --- /dev/null +++ b/config/timegan_config.yaml @@ -0,0 +1,124 @@ + +# -------------------------------------------------------------------------------------- +# LIGHTNING +# -------------------------------------------------------------------------------------- + +gpus: 1 +pin_memory: True +val_check_interval: 0.5 +print_nan_grads: True + +# -------------------------------------------------------------------------------------- +# RUN +# -------------------------------------------------------------------------------------- + +log_dir: logs +num_workers: 8 +exp_name: TGAN-test +tags: + - time_gan_test +no_comet_logger: True +seed: 42 +batch_size: 256 + +# -------------------------------------------------------------------------------------- +# PREPROCESSING +# -------------------------------------------------------------------------------------- + +# don't change these, or preprocessing won't work +target_col: +to_label: False +no_sample_weights: True +binariser_method: + +# -------------------------------------------------------------------------------------- +# MODEL +# -------------------------------------------------------------------------------------- + + +# gru or lstm +rnn: "lstm" + +# embedding weight in cost of generator loss +emb_weight: 1 + +optimizer: "adamw" +dropout: + recovery: 0.2 + embedder: 0.2 + supervisor: 0.2 + generator: 0.2 + discriminator: 0.2 + +num_layers: 3 +hidden_size: 24 +z_dim: 32 +mini_series_length: 256 + +# don't change order with lr dict. +# generator_, embedder1_ separated out for ease of code for now. 
keep lr constant +lr: + embedder0: 0.0005 + supervisor: 0.0005 + generator: 0.0005 + embedder1: 0.0005 + generator_: 0.0005 + embedder1_: 0.0005 + discriminator: 0.0005 + +# -------------------------------------------------------------------------------------- +# DATA +# -------------------------------------------------------------------------------------- + +#data_dir: "C:/Work/dagobert/data/modelling" +#data_dir: "/home/daniel/dagobert_data/modelling" +# data_dir: "C:/Users/marcell/d/data/modelling" +data_dir: "/home/ubuntu/dagobert/data/modelling" + +df_train: + anchor: std_bar_ETHUSDT_tick_1.feather + df2: std_bar_BTCUSDT_tick_1.feather + +df_val: +df_test: + +# the cols of the secondary DFs will automatically be set to anchor's if not defined +cols_to_model: + anchor: + - date_diff + - open + - high + - low + - close + - cum_ticks +# - cum_dollar + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell +# - sin_date +# - cos_date +# - sin_time +# - cos_time + df2: + +augment_method: +augment_dfs: +augment_dfs_mix: 0 + +# -------------------------------------------------------------------------------------- +# PREPROCESSING DATES +# -------------------------------------------------------------------------------------- + +train_start_date: "2019-01-01" +train_days: 500 +val_days: 1 +val_train_offset_days: 1 +val_puffer_days: 1 +test_days: 1 +test_train_offset_days: 62 +test_puffer_days: 1 + +scaling_method: minmax \ No newline at end of file diff --git a/config/timegan_config_local.yaml b/config/timegan_config_local.yaml new file mode 100644 index 00000000..ffe7ebc4 --- /dev/null +++ b/config/timegan_config_local.yaml @@ -0,0 +1,122 @@ + +# -------------------------------------------------------------------------------------- +# LIGHTNING +# -------------------------------------------------------------------------------------- + +gpus: 1 +pin_memory: True +val_check_interval: 0.5 +print_nan_grads: 
True + +# -------------------------------------------------------------------------------------- +# RUN +# -------------------------------------------------------------------------------------- + +log_dir: logs +num_workers: 8 +exp_name: TGAN-test +tags: + - time_gan_test +no_comet_logger: True +seed: 42 +batch_size: 256 + +# -------------------------------------------------------------------------------------- +# GAN +# -------------------------------------------------------------------------------------- + +# gru or lstm +rnn: "lstm" +# embedding weight in cost of generator loss +emb_weight: 1 + +# don't change these, or preprocessing won't work +target_col: +to_label: False +no_sample_weights: True +binariser_method: + +# -------------------------------------------------------------------------------------- +# MODEL +# -------------------------------------------------------------------------------------- + +optimizer: "adamw" +dropout: + recovery: 0.2 + embedder: 0.2 + supervisor: 0.2 + generator: 0.2 + discriminator: 0.2 + +num_layers: 3 +hidden_size: 24 +z_dim: 32 +mini_series_length: 256 + +# don't change order with lr dict. +# generator_, embedder1_ separated out for ease of code for now. 
keep lr constant +lr: + embedder0: 0.0005 + supervisor: 0.0005 + generator: 0.0005 + embedder1: 0.0005 + generator_: 0.0005 + embedder1_: 0.0005 + discriminator: 0.0005 + +# -------------------------------------------------------------------------------------- +# DATA +# -------------------------------------------------------------------------------------- + +#data_dir: "C:/Work/dagobert/data/modelling" +#data_dir: "/home/daniel/dagobert_data/modelling" +data_dir: "C:/Users/marcell/d/data/modelling" +# data_dir: "/home/ubuntu/dagobert/data/modelling" + +df_train: + anchor: std_bar_ETHUSDT_tick_1.feather + df2: std_bar_BTCUSDT_tick_1.feather + +df_val: +df_test: + +# the cols of the secondary DFs will automatically be set to anchor's if not defined +cols_to_model: + anchor: + - date_diff + - open + - high + - low + - close + - cum_ticks +# - cum_dollar + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell +# - sin_date +# - cos_date +# - sin_time +# - cos_time + df2: + +augment_method: +augment_dfs: +augment_dfs_mix: 0 + +# -------------------------------------------------------------------------------------- +# PREPROCESSING +# -------------------------------------------------------------------------------------- + +train_start_date: "2019-01-01" +train_days: 500 +val_days: 1 +val_train_offset_days: 1 +val_puffer_days: 1 +test_days: 1 +test_train_offset_days: 62 +test_puffer_days: 1 + +scaling_method: minmax \ No newline at end of file diff --git a/notebooks/experiments/interact_with_nodes.ipynb b/notebooks/experiments/interact_with_nodes.ipynb index f6e63954..6aa7b0a9 100644 --- a/notebooks/experiments/interact_with_nodes.ipynb +++ b/notebooks/experiments/interact_with_nodes.ipynb @@ -228,7 +228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.5" } }, "nbformat": 4, diff --git 
a/notebooks/experiments/interact_with_nodes_tgan.ipynb b/notebooks/experiments/interact_with_nodes_tgan.ipynb new file mode 100644 index 00000000..3692f08f --- /dev/null +++ b/notebooks/experiments/interact_with_nodes_tgan.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'paramiko'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpathlib\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mPath\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mparamiko\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'paramiko'" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import time\n", + "import paramiko " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'C:\\\\Users\\\\marcell\\\\d\\\\dagobert\\\\notebooks\\\\experiments'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## London GPUs\n", + "\n", + "- log in to all 10\n", + "- pull latest branch\n", + "- delete prev data files" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "london_gpus = [\n", + " 
\"ec2-3-8-198-113.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-132-49-7.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-35-178-168-24.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-130-246-221.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-52-56-202-156.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-132-17-125.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-35-178-170-162.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-3-8-155-239.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-130-180-205.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-3-8-194-52.eu-west-2.compute.amazonaws.com\",\n", + "]\n", + "username = \"ubuntu\"\n", + "london_k = paramiko.RSAKey.from_private_key_file(\"../../../sec/dagobert.pem\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 ec2-3-8-198-113.eu-west-2.compute.amazonaws.com\n", + "0 b''\n", + "1 ec2-18-132-49-7.eu-west-2.compute.amazonaws.com\n", + "1 b''\n", + "2 ec2-35-178-168-24.eu-west-2.compute.amazonaws.com\n", + "2 b''\n", + "3 ec2-18-130-246-221.eu-west-2.compute.amazonaws.com\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 16\u001b[0m \"\"\"\n\u001b[0;32m 17\u001b[0m \u001b[0mssh_stdin\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mssh_stdout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mssh_stderr\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mssh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexec_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcmd2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 18\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mssh_stderr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\file.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 198\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 199\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 200\u001b[1;33m \u001b[0mnew_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_DEFAULT_BUFSIZE\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 201\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mEOFError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 202\u001b[0m \u001b[0mnew_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\channel.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 1374\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1375\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0msize\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1376\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mchannel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_stderr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1377\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1378\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_write\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\channel.py\u001b[0m in \u001b[0;36mrecv_stderr\u001b[1;34m(self, nbytes)\u001b[0m\n\u001b[0;32m 745\u001b[0m \"\"\"\n\u001b[0;32m 746\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 747\u001b[1;33m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0min_stderr_buffer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 748\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mPipeTimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 749\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\buffered_pipe.py\u001b[0m in 
\u001b[0;36mread\u001b[1;34m(self, nbytes, timeout)\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_closed\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[0mthen\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 160\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_cv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 161\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 162\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;33m-=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mthen\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\threading.py\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 294\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 295\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m 
\u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 296\u001b[1;33m \u001b[0mwaiter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 297\u001b[0m \u001b[0mgotit\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 298\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for i, hostname in enumerate(london_gpus):\n", + " ssh = paramiko.SSHClient()\n", + " ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n", + " ssh.connect(hostname=hostname, username=username, pkey=london_k)\n", + " print(i, hostname)\n", + " cmd = \"\"\"\n", + " cd dagobert/dagobert;\n", + " rm ../data/modelling/*;\n", + " git pull https://danielhomola:4frvgh%GTB@github.com/danielhomola/dagobert hparams_labelling;\n", + " \"\"\"\n", + " ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(cmd)\n", + " print(i, ssh_stderr.read())\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Zip all log folders\n", + "\n", + "- log in to all gpus and zip all folders that start with log and log the models too\n", + "- uplaod them to s3\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import time\n", + "import paramiko " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "local_path = Path('/home/daniel/dagobert_data/')\n", + "username = \"ubuntu\"\n", + "ohio_k = paramiko.RSAKey.from_private_key_file(\"../../../sec/dagobert_preprocessing_node.pem\")\n", + "london_k = 
paramiko.RSAKey.from_private_key_file(\"../../../sec/dagobert.pem\")\n", + "nodes = {\n", + " \"gpu1\": {\"hostname\": \"ec2-52-20-7-61.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu2\": {\"hostname\": \"ec2-52-22-178-27.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu3\": {\"hostname\": \"ec2-54-147-237-118.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu4\": {\"hostname\": \"ec2-54-152-39-74.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu5\": {\"hostname\": \"ec2-54-225-32-4.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu6\": {\"hostname\": \"ec2-54-90-219-179.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu7\": {\"hostname\": \"ec2-100-24-115-15.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu8\": {\"hostname\": \"ec2-3-236-251-175.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu9\": {\"hostname\": \"ec2-34-237-76-111.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu10\": {\"hostname\": \"ec2-3-10-228-3.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu11\": {\"hostname\": \"ec2-18-130-191-126.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu12\": {\"hostname\": \"ec2-3-10-150-229.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu13\": {\"hostname\": \"ec2-3-8-28-118.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu14\": {\"hostname\": \"ec2-35-176-172-205.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu15\": {\"hostname\": \"ec2-18-133-29-17.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu16\": {\"hostname\": \"ec2-18-133-64-254.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu17\": {\"hostname\": \"ec2-3-8-197-96.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu18\": {\"hostname\": \"ec2-35-178-66-77.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu19\": {\"hostname\": 
\"ec2-3-8-181-180.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------------------\n", + "gpu6\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu6\n", + "-------------------------------------------\n", + "gpu7\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu7\n", + "-------------------------------------------\n", + "gpu8\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu8\n", + "-------------------------------------------\n", + "gpu9\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu9\n" + ] + } + ], + "source": [ + "for name, node in nodes.items():\n", + " ssh = paramiko.SSHClient()\n", + " ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n", + " ssh.connect(hostname=node['hostname'], username=username, pkey=node['key'])\n", + " \n", + " print('-------------------------------------------')\n", + " print(name)\n", + " print('-------------------------------------------')\n", + " \n", + " cmd = (\n", + " f\"cd dagobert/dagobert;\"\n", + " f\"sudo apt install zip;\"\n", + " f\"rm -rf logs_run1;\"\n", + " f\"ls | grep logs | xargs zip {name}_all_logs.zip -r;\"\n", + " f\"zip {name}_models.zip -r TCN;\"\n", + " f\"aws s3 cp {name}_all_logs.zip s3://dagobert/;\"\n", + " f\"aws s3 cp {name}_models.zip s3://dagobert/;\"\n", + " )\n", + " ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(cmd)\n", + " print(f'Uploaded everything successfully for {name}')\n", + " \n", + " # download zip - not used in the end because we have better ways\n", + " # ftp_client=ssh.open_sftp()\n", + " # ftp_client.get(\"/home/ubuntu/dagobert/dagobert/all_logs.zip\", local_path / 
f\"{name}_all_logs.zip\")\n", + " # print (f\"Downloaded all_zips from {name}.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/modelling/rl_env.ipynb b/notebooks/modelling/rl_env.ipynb new file mode 100644 index 00000000..772e8a63 --- /dev/null +++ b/notebooks/modelling/rl_env.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from dagobert.io import S3Connector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PortfolioSim\n", + "\n", + "We go through the `step` function of the `PortfolioSim` class to understand what is it doing. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.48192771, 0.26506024, 0.25301205])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eps = np.finfo(float).eps\n", + "\n", + "# orig portfolio value\n", + "p0 = 1\n", + "\n", + "# orig portfolio allocation (50% cash, nothing in 25% btc, 25% eth)\n", + "w0 = np.array([.5, .25, .25])\n", + "\n", + "# new relative price vector, expressed as returns (BTC went up 10%, ETH 5%)\n", + "y1 = np.array([1, 1.1, 1.05])\n", + "\n", + "# (eq7) since we last acted prices changed, so weights evolve (see below)\n", + "dw1 = (y1 * w0) / (np.dot(y1, w0) + eps)\n", + "dw1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.00020481927710843396" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# new weight vector from the agent for this timestep (predicted correctly that BTC/ETH will go up and allocated more USD to them)\n", + "w1 = np.array([.4, .3, .3])\n", + "\n", + "# (eq16) cost to change portfolio:\n", + "# excluding change in cash to avoid double counting for transaction cost\n", + "mu = 0.0025 * (np.abs(dw1[1:] - w1[1:])).sum()\n", + "mu" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9997951807228915" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p0 * (1 - mu)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p0" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "1.0372875000000001" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# (eq11) new portfolio value: see section between (eq19-20) why this works\n", + "p1 = p0 * (1 - mu) * np.dot(y1, w0)\n", + "p1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.03728750000000014" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rho1 = p1 / p0 - 1 # rate of returns\n", + "rho1" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "p0 = p1\n", + "w0 = w1" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 1.0366750000000002\n", + "rho 0.036675000000000235\n", + "p1 1.0519309684687503\n", + "rho 0.014716249999999986\n", + "p1 1.05953774428449\n", + "rho 0.007231250000000022\n", + "p1 1.05953774428449\n", + "rho 0.0\n" + ] + } + ], + "source": [ + "p0 = 1\n", + "w0 = np.array([.5, .25, .25])\n", + "\n", + "def step(y1, w1, w0, p0):\n", + " dw1 = (y1 * w0) / (np.dot(y1, w0) + eps)\n", + " mu = 0.0025 * (np.abs(dw1[1:] - w1[1:])).sum()\n", + " p1 = p0 * (1 - mu) * np.dot(y1, w0)\n", + " rho1 = p1 / p0 - 1\n", + " print('p1', p1)\n", + " print('rho', rho1)\n", + " return p1\n", + "\n", + "# BTC, ETH is going up but the agent is selling them, so return and p value should go down due to transaction cost in last sale\n", + "y1 = np.array([1, 1.1, 1.05])\n", + "w1 = np.array([.8, .1, .1])\n", + "p1 = step(y1, w1, w0, p0)\n", + "\n", + "y2 = np.array([1, 1.1, 1.05])\n", + "w2 = np.array([.9, .05, .05])\n", + "p2 = step(y2, w2, w1, p1)\n", + "\n", + "\n", + "y3 = np.array([1, 1.1, 1.05])\n", + "w3 = np.array([1, 0, 0])\n", + "p3 = step(y3, w3, w2, p2)\n", + "\n", + "y4 
= np.array([1, 1.1, 1.05])\n", + "w4 = np.array([1, 0, 0])\n", + "p4 = step(y4, w4, w3, p3)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 0.9618249999999999\n", + "rho -0.03817500000000007\n", + "p1 0.9471319208437499\n", + "rho -0.015276250000000102\n", + "p1 0.9253212485790698\n", + "rho -0.023028125000000066\n" + ] + } + ], + "source": [ + "# BTC, ETH is going down and the agent is buying them, so return and p value should go down fast\n", + "y1 = np.array([1, 0.9, .95])\n", + "w1 = np.array([.8, .1, .1])\n", + "\n", + "p1 = step(y1, w1, w0, p0)\n", + "\n", + "y2 = np.array([1, .9, .95])\n", + "w2 = np.array([.7, .15, .15])\n", + "p2 = step(y2, w2, w1, p1)\n", + "\n", + "y3 = np.array([1, .9, .95])\n", + "w3 = np.array([.5, .25, .25])\n", + "p3 = step(y3, w3, w2, p2)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 0.9618249999999999\n", + "rho -0.03817500000000007\n", + "p1 0.9471896303437499\n", + "rho -0.015216250000000042\n", + "p1 0.9398666705141548\n", + "rho -0.007731249999999967\n" + ] + } + ], + "source": [ + "# BTC, ETH is going down and the agent is selling them, so return and p value should go down but not as fast as in the prev example\n", + "y1 = np.array([1, 0.9, .95])\n", + "w1 = np.array([.8, .1, .1])\n", + "\n", + "p1 = step(y1, w1, w0, p0)\n", + "\n", + "y2 = np.array([1, .9, .95])\n", + "w2 = np.array([.9, .05, .05])\n", + "p2 = step(y2, w2, w1, p1)\n", + "\n", + "y3 = np.array([1, .9, .95])\n", + "w3 = np.array([1, 0, 0])\n", + "p3 = step(y3, w3, w2, p2)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 1.0242\n", + "rho 0.0242\n", + "p1 1.0342038734999999\n", + "rho 0.009767499999999929\n", + "p1 
1.049219220988378\n", + "rho 0.014518749999999914\n" + ] + } + ], + "source": [ + "# BTC, ETH is going up and the agent is buying them, so return and p value should go up\n", + "y1 = np.array([1, 1.05, 1.05])\n", + "w1 = np.array([.8, .1, .1])\n", + "\n", + "p1 = step(y1, w1, w0, p0)\n", + "\n", + "y2 = np.array([1, 1.05, 1.05])\n", + "w2 = np.array([.7, .15, .15])\n", + "\n", + "p2 = step(y2, w2, w1, p1)\n", + "\n", + "y3 = np.array([1, 1.05, 1.05])\n", + "w3 = np.array([.5, .25, .25])\n", + "\n", + "p3 = step(y3, w3, w2, p2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/modelling/test_cryptodataset.ipynb b/notebooks/modelling/test_cryptodataset.ipynb index c6d23254..8e3399e3 100644 --- a/notebooks/modelling/test_cryptodataset.ipynb +++ b/notebooks/modelling/test_cryptodataset.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -12,8 +21,9 @@ "import pandas as pd\n", "import numpy as np\n", "import torch\n", + "from pathlib import Path\n", "\n", - "from dagobert.modelling.dl import CryptoDataset\n", + "from dagobert.modelling.dl import CryptoDataset, GeneratorCryptoDataset\n", "from dagobert.preprocessing.utils import set_dt_index" ] }, @@ -27,7 +37,8 @@ "output_type": "stream", "text": [ " dev\n", - "* test/cryptodata\n" + " feat/orderbook_data\n", + "* feat/tgan\n" ] } ], @@ -39,11 +50,229 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# conda environments:\n", + "#\n", + "base C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\n", + "dagobert * C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\n", + "tensorenviron C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\tensorenviron\n", + "\n" + ] + } + ], + "source": [ + "! conda env list" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ "df = pd.read_feather(\"C:/Users/u164428/Desktop/Dagobert/data/modelling/std_bar_XRPUSDT_volume_125000.feather\")" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
date_diffsin_timecos_timesin_datecos_datedate_timeopenclosehighlowcum_tickscum_dollarvolumecum_volume_buycum_volume_sellcum_volume_quotecum_volume_quote_buycum_volume_quote_sell
00.0-0.9469300.3214390.587785-0.8090172018-05-26 19:15:000.620490.620730.622300.61990118.082411.086333132750.4565058.6067691.8582434.10089940405.12935942028.971540
1840.0-0.9255410.3786490.587785-0.8090172018-05-26 19:29:000.620740.621240.621810.62000119.081044.155573130494.8188327.5442167.2781042.37997554868.96517926173.414796
2420.0-0.9135450.4067370.587785-0.8090172018-05-26 19:36:000.620690.621510.621570.6200171.079926.853613128650.06126545.462104.6079913.08364278606.0554881307.028154
3420.0-0.9006980.4344450.587785-0.8090172018-05-26 19:43:000.621510.622250.622250.6201097.084404.854894135782.73129644.416138.3284370.10654080558.6915123811.415028
4840.0-0.8724960.4886210.587785-0.8090172018-05-26 19:57:000.621080.620560.622240.61925118.091415.801332147278.7893035.0254243.7691370.89939557733.40753533637.491861
\n", + "
" + ], + "text/plain": [ + " date_diff sin_time cos_time sin_date cos_date date_time \\\n", + "0 0.0 -0.946930 0.321439 0.587785 -0.809017 2018-05-26 19:15:00 \n", + "1 840.0 -0.925541 0.378649 0.587785 -0.809017 2018-05-26 19:29:00 \n", + "2 420.0 -0.913545 0.406737 0.587785 -0.809017 2018-05-26 19:36:00 \n", + "3 420.0 -0.900698 0.434445 0.587785 -0.809017 2018-05-26 19:43:00 \n", + "4 840.0 -0.872496 0.488621 0.587785 -0.809017 2018-05-26 19:57:00 \n", + "\n", + " open close high low cum_ticks cum_dollar volume \\\n", + "0 0.62049 0.62073 0.62230 0.61990 118.0 82411.086333 132750.45 \n", + "1 0.62074 0.62124 0.62181 0.62000 119.0 81044.155573 130494.81 \n", + "2 0.62069 0.62151 0.62157 0.62001 71.0 79926.853613 128650.06 \n", + "3 0.62151 0.62225 0.62225 0.62010 97.0 84404.854894 135782.73 \n", + "4 0.62108 0.62056 0.62224 0.61925 118.0 91415.801332 147278.78 \n", + "\n", + " cum_volume_buy cum_volume_sell cum_volume_quote cum_volume_quote_buy \\\n", + "0 65058.60 67691.85 82434.100899 40405.129359 \n", + "1 88327.54 42167.27 81042.379975 54868.965179 \n", + "2 126545.46 2104.60 79913.083642 78606.055488 \n", + "3 129644.41 6138.32 84370.106540 80558.691512 \n", + "4 93035.02 54243.76 91370.899395 57733.407535 \n", + "\n", + " cum_volume_quote_sell \n", + "0 42028.971540 \n", + "1 26173.414796 \n", + "2 1307.028154 \n", + "3 3811.415028 \n", + "4 33637.491861 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.iloc[:, :18]\n", + "df.head()" + ] + }, { "cell_type": "code", "execution_count": 73, @@ -1389,6 +1618,103 @@ "source": [ "x.searchsorted(5, side=\"left\"), x.searchsorted(5, side=\"right\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TimeGAN" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "BATCH_SIZE = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "std_bar_XRPUSDT_tick_1.feather doesn't have enough bars to use with the anchor after restricting it to the max date of the anchor's index:std_bar_XRPUSDT_tick_1.feather: 1279233 bars / anchor: 1624303 bars.\n", + "The samples from this augment_dfs will be less unique as we approachthe end date of the anchor 2020-10-17T00:06:00.000000000.\n", + "std_bar_XRPUSDT_tick_1.feather doesn't have adequate time-coverage for anchor DF. This could lead to non-unique samples from this augment_dfs.\n", + "\n", + "Anchor min/max dates: 2017-08-19T00:39:00.000000000/2020-10-17T00:06:00.000000000. \n", + "std_bar_XRPUSDT_tick_1.feather min/max dates: 2018-05-06 00:09:00/2020-10-17 00:06:00.\n" + ] + } + ], + "source": [ + "dataset = GeneratorCryptoDataset(\n", + " df_to_load = {\"anchor\": \"std_bar_ETHUSDT_tick_1.feather\",\n", + " \"df2\": \"std_bar_XRPUSDT_tick_1.feather\"},\n", + " cols_to_model = {\"anchor\": [\"date_diff\", \"open\", \"high\", \"low\", \"close\"],\n", + " \"df2\": [\"date_diff\", \"open\"]},\n", + " target_col = None,\n", + " mini_series_length = 3,\n", + " last_y = True,\n", + " date_col = \"date_time\",\n", + " data_dir = Path(\"C:/Users/u164428/Desktop/Dagobert/data/modelling\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "data_loaded = list(torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "541435" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data_loaded)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([3, 3, 7])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" 
+ } + ], + "source": [ + "data_loaded[0].shape " + ] } ], "metadata": { diff --git a/notebooks/notes/rl_episodes.xlsx b/notebooks/notes/rl_episodes.xlsx new file mode 100644 index 00000000..db08bae4 Binary files /dev/null and b/notebooks/notes/rl_episodes.xlsx differ diff --git a/setup.cfg b/setup.cfg index c845e0c5..5c702758 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,8 @@ console_scripts = dagobert-tcn = dagobert.modelling.dl.tcn_runner:run dagobert-optuna = dagobert.modelling.dl.optuna:run dagobert-s3 = dagobert.io.runner:run + dagobert-rl = dagobert.modelling.rl.rl_runner:run + dagobert-tgan = dagobert.modelling.augmentation.tgan_runner:run [test] # py.test options when running `python setup.py test` diff --git a/src/dagobert/data/lambda/orderbook_data.py b/src/dagobert/data/lambda/orderbook_data.py new file mode 100644 index 00000000..0ddbfb33 --- /dev/null +++ b/src/dagobert/data/lambda/orderbook_data.py @@ -0,0 +1,28 @@ +import os +import boto3 +from binance.client import Client +import pandas as pd +import time + + +def fetch_orderbook_data(): + s3 = boto3.resource("s3") + client = Client("", "") + + pairs = ["BTCUSDT", "ETHUSDT", "XRPUSDT", "BCHUSDT", "LTCUSDT"] + bucket_name = "dagobert-orderbook" + + for pair in pairs: + response = client.get_order_book(symbol=pair, limit=1000) + + df = pd.DataFrame(response) + df = df[["bids", "asks"]] + + name = f"{pair}_{int(time.time())}.csv" + + df.to_csv(name, compression="gzip") + + file_object = s3.Object(bucket_name, name) + file_object.upload_file(name) + + os.remove(name) diff --git a/src/dagobert/modelling/augmentation/README.md b/src/dagobert/modelling/augmentation/README.md new file mode 100644 index 00000000..fc4ee69f --- /dev/null +++ b/src/dagobert/modelling/augmentation/README.md @@ -0,0 +1,54 @@ +# Dagobert augmentation / TGAN module + +This module holds the implementation of TimeGAN. It is adopted to fit into Pytorch Lightning so we get benefits of easy +set up, checkpointing etc. 
The network is based on +[this paper](https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf). + +A couple of other useful links can be found in [this issue](https://github.com/danielhomola/dagobert/issues/63) + +## Running it +Like the `dl` module, this module is to be driven from the cmd line via an entry-point and a config file. +``` +dagobert-tgan -c config/timegan_config.yaml +``` + +## Config params +There are some config params which we can tinker with for optimal training. The example one +(`config/timegan_config.yml`) is nicely structured in blocks so it should be easy to understand which relate to the +TimeGAN model structure. + +The params are nicely documented in `augmentation/tgan_args.py` so make sure to check there before trying to find out +from the code what each of these do. + +## The TimeGAN +Much of the structure was implemented as quoted from the original paper (above). Five RNNs work together to create a +learned embedding space optimized with both supervised and adversarial objectives, encouraging the network to adhere to +the temporal dynamics of the training data. + +GAN convergence is notoriously tricky, and there are a bunch of handles and hyperparameters we can toggle +(some of this is inspired by various literature about training GANs more widely): +- in order for the discriminator not to get 'too smart', we optimise it on the simple condition that the loss is not too + small. +- the generator (and one of the embedders) is optimised twice before every optimisation of the discriminator - this is currently implemented in a + crude way, but it works. 
+- convergence is very training intensive, the authors and [other implementations](https://github.com/jsyoon0823/TimeGAN) + all refer to 5-10k epochs +- one important aspect of this network is that the output series' length is a hyperparameter we set before training, and + need to feed in the same length for X - we call this mini series length +- performance is measured visually by PCA, and t-SNE analyses between the original and the synthetic data. For +discriminative performance the authors use an rnn classifier to distinguish between real and synthetic data. For + predictive performance they train an RNN to predict the last element of a series on synthetic samples. This trained + rnn is then validated on real data, measuring MAE. These latter two are not implemented currently, as synthetic data + was already fucked up upon visual inspection of PCA, t-SNE. + +## Future work +- experiment with more advanced learning rates for different components of TimeGAN +- warm up training of generator is a common measure to avoid lack of convergence +- try different thresholds for prohibiting discriminator optimisation +- it is challenging to iterate fast with this project. Convergence can take time, and training the generator +can go well initially and then deteriorate or vice versa. To always wait for being able to inspect (PCA,t-SNE) visuals, + is time consuming and inconsistent. Some distance measurements between real/synthetic can come in handy for triggering + various actions, or just introducing more consistency into monitoring. + + + diff --git a/src/dagobert/modelling/augmentation/tgan_args.py b/src/dagobert/modelling/augmentation/tgan_args.py new file mode 100644 index 00000000..0491bbcb --- /dev/null +++ b/src/dagobert/modelling/augmentation/tgan_args.py @@ -0,0 +1,81 @@ +""" +All custom arguments and hyper-parameters for the TimeGAN module. 
+""" + +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + +from pytorch_lightning import Trainer + +from dagobert.modelling.dl.tcn import TCNLightning +from dagobert.modelling.dl.tcn_args import ( + add_run_specific_args, + add_model_specific_args, + add_data_specific_args, + add_preprocessing_specific_args, +) +from dagobert.naming import NGAN + + +def add_tgan_specific_args(parent_parser): + parser = ArgumentParser( + parents=[parent_parser], + add_help=False, + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # this is just a place-holder so it's easier to read the million params in the cmd + parser.add_argument("--TGAN_PARAMS", help="====================================") + parser.add_argument( + "--z_dim", + type=int, + default=50, + help="number of dimensions of noise vector (input of generator) at t timepoint", + ) + parser.add_argument( + "--hidden_size", + type=int, + default=50, + help="The number of features in the hidden state, ie in embedded state.", + ) + parser.add_argument( + "--num_layers", + type=int, + default=1, + help=( + "Number of RNN layers stacked onto each other, ie with new one using output" + " of previous" + ), + ) + parser.add_argument( + "--rnn", + type=str, + default=NGAN.lstm, + choices=[NGAN.lstm, NGAN.gru], + help="Choice of RNN to use, either LSTM or GRU", + ) + parser.add_argument( + "--emb_weight", + type=int, + default=1, + help="Weight multiplier for embedding component in generator loss", + ) + + return parser + + +def get_all_args(): + parser = ArgumentParser( + description="Lightning TimeGAN module", + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # add model params of lightning trainer (this HAS to be first) + parser = Trainer.add_argparse_args(parser) + + # add model and run specific params + parser = add_tgan_specific_args(parser) + parser = add_model_specific_args(parser) + parser = add_run_specific_args(parser) + parser = add_data_specific_args(parser) + parser = 
add_preprocessing_specific_args(parser) + return parser.parse_args() diff --git a/src/dagobert/modelling/augmentation/tgan_runner.py b/src/dagobert/modelling/augmentation/tgan_runner.py new file mode 100644 index 00000000..90d4e8b6 --- /dev/null +++ b/src/dagobert/modelling/augmentation/tgan_runner.py @@ -0,0 +1,39 @@ +""" +Dagobert's runner for TimeGAN. + +This module is driven by the `dagobert-tgan` command which can be parametrised by +command line arguments, but it's much more convenient to use YAML configs for this, +see the `tcn_args.py` and `tgan_args.py` for more detail. +""" +import os +import sys +import logging +from pathlib import Path + +from dagobert.utils import setup_logging +from dagobert.runner_utils import load_config, update_args +from dagobert.modelling.augmentation.tgan_args import get_all_args +from dagobert.modelling.augmentation.timegan import run_tgan + +logger = logging.getLogger(__name__) + + +def run(): + """ + Initialise a TimeGan network and train it. + """ + + # parse arguments and setup logging + args = get_all_args() + setup_logging(logger, "dagobert-tgan", logging.INFO, args.log_dir) + + # load config yaml if exist + if args.config_path != "": + config = load_config(Path(args.config_path)) + args = update_args(args, config) + + run_tgan(args) + + +if __name__ == "__main__": + run() diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py new file mode 100644 index 00000000..bdccfd7e --- /dev/null +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -0,0 +1,787 @@ +""" +TimeGAN network, following the original implementation: +https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/tgan.py. 
+& +https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf +""" +from typing import List, Optional +from argparse import Namespace +import logging +from copy import deepcopy +from pathlib import Path + +from sklearn.manifold import TSNE +import numpy as np +import pandas as pd +import matplotlib +from scipy.stats import spearmanr +from matplotlib.figure import Figure + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as f +from torch.nn.utils import weight_norm +from torch.utils.data import Dataset, WeightedRandomSampler, RandomSampler, DataLoader + +from pytorch_lightning import LightningModule +from pytorch_lightning.trainer import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning import Trainer, Callback, loggers +from pytorch_lightning.metrics import functional as plm + +from dagobert.naming import NStudy, NGAN, NPreprocessingArgs as npa + +from dagobert.modelling.dl import Preprocessing +from dagobert.modelling.dl.data import GeneratorCryptoDataset +from dagobert.modelling.dl import AdaBelief + +from dagobert.modelling.augmentation.utils import ( + get_noise, + pca_analysis, + tsne_analysis, +) +from dagobert.modelling.utils import ( + triple_barrier_error, + non_vertical_error, + t2n, + cm_from_tensor, + hist_from_tensor, + plot_from_tensor, + plot_cm, + fig_to_tb, + fig_to_comet, + plot_pca, + plot_tsne, + update_lookback, + plot_anchor_sample, +) + + +logger = logging.getLogger(__name__) + + +def run_tgan(args): + # setup loggers + seed_everything(args.seed) + tb_logger_name = None + comet_name = args.exp_name + gan_loggers = [] + tb_logger = loggers.TensorBoardLogger( + save_dir=Path(args.log_dir), name=args.exp_name, version=tb_logger_name + ) + gan_loggers.append(tb_logger) + if not args.no_comet_logger: + gan_loggers.append( + loggers.CometLogger( + api_key=NStudy.comet_api_key, + workspace=NStudy.comet_workspace, + 
save_dir=args.log_dir, + project_name=NStudy.comet_project_name, + experiment_name=f"{comet_name}_{tb_logger.version}", + ) + ) + + # setup callbacks + checkpoint_callback = ModelCheckpoint( + monitor="loss_gen/val", + filename="_{epoch:02d}_{avg_reward:.10f}", + dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", + save_top_k=5, + mode="min", + ) + + # define trainer and and lightning module + args.multiprocessing = True if args.gpus != 1 else False + trainer = Trainer.from_argparse_args( + args, + logger=gan_loggers, + checkpoint_callback=checkpoint_callback, + ) + model = TimeGANLightning(args) + trainer.fit(model) + # trainer.test() + + +class RnnBlock(nn.Module): + """ + Class for creating 5 different rnn-based nets as components of TimeGAN. + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int, + linear_input_size: int, + linear_output_size: int, + dropout: float = 0.2, + batch_first: bool = True, + rnn: str = "lstm", + linear_activation: bool = True, + ): + super(RnnBlock, self).__init__() + + # input/output: (batch, seq, feature) + if rnn == NGAN.lstm: + self.rnn = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + elif rnn == NGAN.gru: + self.rnn = nn.GRU( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + self.linear_output_size = linear_output_size + self.tanh = nn.Tanh() + self.linear = nn.Linear(linear_input_size, linear_output_size) + self.sigmoid = nn.Sigmoid() + self.linear_activation = linear_activation + + def forward(self, x): + rnn_out, _hidden = self.rnn(x) + rnn_out = self.tanh(rnn_out) + # reshape if net is the discriminator, and no activation + if self.linear_output_size == 1: + rnn_out = rnn_out.reshape(rnn_out.shape[0], -1) + output = self.linear(rnn_out) + else: + output = self.linear(rnn_out) + output = self.sigmoid(output) 
+ return output + + +class TimeGANLightning(LightningModule): + """ + Lightning model made of 5 RNN nets working together: + - Embedding network between original feature space to latent space, provides + lower-dimensional adversarial learning space. + - Recovery network from latent space to original space. + - Generator function: generate time-series data in latent space. + - Discriminate the original and synthetic time-series data + - Supervisor generating next sequence using the previous sequence to better + capture temporal dynamics + """ + + # ---------------------------------------------------------------------------------- + # INIT, (FORWARD) + # ---------------------------------------------------------------------------------- + + def __init__(self, hparams: Namespace): + """ + Class constructor. + + Args: + hparams: Hyper-params passed in to the module. See the docs for more details + https://pytorch-lightning.readthedocs.io/en/latest/hyperparameters.html + and dagobert.modelling.dl.tcn_args for more information on the params. 
+ """ + + # define main vars (other than model) + super().__init__() + hparams = TimeGANLightning._pre_sanity_check(hparams) + # lightning sets this to cuda too late for some of our setup to work + self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" + # prepare datafiles if necessary + self.hparams = Preprocessing().preprocess_train_dfs(hparams) + # TODO: any sanity checks on data, hypermparams + + self.comet_logging = not self.hparams.no_comet_logger + + # get feature number of instruments + num_inputs = [len(cols) for dataset, cols in self.hparams.cols_to_model.items()] + all_inputs = sum(num_inputs) + + # components of network + self.generator = RnnBlock( + input_size=self.hparams.z_dim, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=self.hparams.hidden_size, + dropout=self.hparams.dropout[NGAN.generator], + batch_first=True, + rnn=self.hparams.rnn, + linear_activation=True, + ) + self.embedder = RnnBlock( + input_size=all_inputs, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=self.hparams.hidden_size, + dropout=self.hparams.dropout[NGAN.embedder], + batch_first=True, + rnn=self.hparams.rnn, + linear_activation=True, + ) + # Generate next sequence using the previous sequence. 
+ self.supervisor = RnnBlock( + input_size=self.hparams.hidden_size, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=self.hparams.hidden_size, + dropout=self.hparams.dropout[NGAN.supervisor], + batch_first=True, + rnn=self.hparams.rnn, + linear_activation=True, + ) + self.recovery = RnnBlock( + input_size=self.hparams.hidden_size, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=all_inputs, + dropout=self.hparams.dropout[NGAN.recovery], + batch_first=True, + rnn=self.hparams.rnn, + linear_activation=True, + ) + # TODO: shape of disc (batch, time, 1) or more rather (batch, 1) + self.discriminator = RnnBlock( + input_size=self.hparams.hidden_size, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size + * self.hparams.mini_series_length, + linear_output_size=1, + dropout=self.hparams.dropout[NGAN.discriminator], + batch_first=True, + rnn=self.hparams.rnn, + linear_activation=False, + ) + self = self.float() + + # ---------------------------------------------------------------------------------- + # OPTIMIZER SETUP & TRAIN + # ---------------------------------------------------------------------------------- + + def training_step(self, batch, batch_idx, optimizer_idx): + """ + Carries out updates to networks from a batch of real samples. 
+ Args: + batch: batch of X + batch_idx: idx of batch + optimizer_idx: idx that controls optimizing the 5 networks + + Returns: + Loss + """ + x = batch.float() + batch_len = len(x) + + h = self.embedder(x) + + # optimizers #0, #3 & #5 update embedder nets + if optimizer_idx in [0, 3, 5]: + x_tilde = self.recovery(h) + # optimize embedding via embedder and recovery nets + if optimizer_idx == 0: + loss_e = TimeGANLightning.embed_loss0(x_tilde, x) + self.log( + "loss_e/train", + loss_e, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + + return loss_e + + elif optimizer_idx in [3, 5]: + h_hat_supervise = self.supervisor(h) + loss_embed = TimeGANLightning.embedder_loss( + x_tilde, + x, + h_hat_supervise, + h, + ) + self.log( + "loss_embed/train", + loss_embed, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return loss_embed + + # optimize supervisor + elif optimizer_idx == 1: + h_hat_supervise = self.supervisor(h) + loss_supervisor = TimeGANLightning.supervisor_loss(h_hat_supervise, h) + self.log( + "loss_supervisor/train", + loss_supervisor, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return loss_supervisor + + # TODO: can we streamline 7 optimizers to 5 with optimizer_step() hook? 
+ elif optimizer_idx in [2, 4, 6]: + # random input to generator + z = get_noise( + batch_len, + self.hparams.mini_series_length, + self.hparams.z_dim, + device=self.tgan_device, + ) + # update generator + if optimizer_idx in [2, 4]: + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + h_hat_supervise = self.supervisor(h) + + # synthetic data + x_hat = self.recovery(h_hat) + + # no_grad to leave discriminator unchanged + with torch.no_grad(): + y_fake = self.discriminator(h_hat) + y_fake_e = self.discriminator(e_hat) + loss_gen = TimeGANLightning.generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + self.hparams.emb_weight, + ) + self.log( + "loss_gen/train", + loss_gen, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return loss_gen + + # update discriminator + elif optimizer_idx == 6: + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat.detach()) + # detach to update only discriminator + y_fake = self.discriminator(h_hat.detach()) + y_fake_e = self.discriminator(e_hat.detach()) + y_real = self.discriminator(h.detach()) + + loss_disc = TimeGANLightning.discriminator_loss( + y_fake, y_fake_e, y_real, self.hparams.emb_weight + ) + self.log( + "loss_disc/train", + loss_disc, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + # limit discriminator from being "too good" + if loss_disc > 0.15: + # pytorch lightning needs to have "loss" in the return dict + return { + "loss": loss_disc, + "loss_disc/train": loss_disc, + "y_fake/train": y_fake, + "y_fake_e/train": y_fake_e, + "y_real/train": y_real, + } + + def configure_optimizers(self) -> List[optim.Optimizer]: + """ + Optimizer setup. List of optimizers accessed by idx in training step. 
+ """ + optimizers = [] + param_pairs = [ + list(self.embedder.parameters()) + list(self.recovery.parameters()), + list(self.generator.parameters()) + list(self.supervisor.parameters()), + list(self.generator.parameters()) + list(self.supervisor.parameters()), + list(self.embedder.parameters()) + list(self.recovery.parameters()), + list(self.generator.parameters()) + list(self.supervisor.parameters()), + list(self.embedder.parameters()) + list(self.recovery.parameters()), + list(self.discriminator.parameters()), + ] + if "adamw" in self.hparams.optimizer.lower(): + for param_pair, network in zip(param_pairs, self.hparams.lr.keys()): + optimizer = torch.optim.AdamW(param_pair, lr=self.hparams.lr[network]) + optimizers.append(optimizer) + elif "adabelief" in self.hparams.optimizer.lower(): + for param_pair, network in zip(param_pairs, self.hparams.lr.keys()): + optimizer = AdaBelief(param_pair, lr=self.hparams.lr[network]) + optimizers.append(optimizer) + return optimizers + + def train_dataloader(self): + return self._get_dataloader(self.hparams.df_train, "train") + + def validation_step(self, batch, batch_idx): + # change float64 to float32 + x = batch.float() + batch_len = len(x) + + # noise + z = get_noise( + batch_len, + self.hparams.mini_series_length, + self.hparams.z_dim, + device=self.tgan_device, + ) + + # generate fake data and compare with validation set + h = self.embedder(x) + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + x_hat = self.recovery(h_hat) + h_hat_supervise = self.supervisor(h) + + y_fake = self.discriminator(h_hat.detach()) + y_fake_e = self.discriminator(e_hat.detach()) + y_real = self.discriminator(h.detach()) + + pca_x, pca_x_hat = pca_analysis(t2n(x), t2n(x_hat)) + tsne_x, tsne_x_hat = tsne_analysis(t2n(x), t2n(x_hat)) + + loss_gen = TimeGANLightning.generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + self.hparams.emb_weight, + ) + self.log( + "loss_gen/val", + loss_gen, + on_step=False, + 
on_epoch=True, + prog_bar=True, + logger=True, + ) + loss_disc = TimeGANLightning.discriminator_loss( + y_fake, y_fake_e, y_real, self.hparams.emb_weight + ) + self.log( + "loss_disc/val", + loss_disc, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return { + "loss_gen/val": loss_gen, + "loss_disc/val": loss_disc, + "y_fake/val": y_fake, + "y_fake_e/val": y_fake_e, + "y_real/val": y_real, + "pca_x/val": pca_x, + "pca_x_hat/val": pca_x_hat, + "tsne_x/val": tsne_x, + "tsne_x_hat/val": tsne_x_hat, + } + + def validation_epoch_end(self, outputs): + return self._epoch_end(outputs, "val") + + def training_epoch_end(self, outputs): + return self._epoch_end(outputs, "train") + + def val_dataloader(self): + return self._get_dataloader(self.hparams.df_val, "val") + + # ---------------------------------------------------------------------------------- + # SETUP FUNCTIONS + # ---------------------------------------------------------------------------------- + def _get_dataloader(self, dfs_to_load: dict, prefix: str) -> DataLoader: + """ + Returns a dataloader for train and validation sets. + Args: + dfs_to_load: Either train, validation or test DFs to load. + prefix: Name of phase, either train or val. + Returns: + Instantiated DataLoader. 
+ """ + # define dataset and plot it + if prefix == "train": + shuffle = True + else: + shuffle = False + + dataset = GeneratorCryptoDataset( + df_to_load=dfs_to_load, + cols_to_model=self.hparams.cols_to_model, + target_col=self.hparams.target_col, + mini_series_length=self.hparams.mini_series_length, + last_y=self.hparams.last_y, + data_dir=self.hparams.data_dir, + ) + self._plot_dataset(*dataset.plot(), prefix) + return DataLoader( + dataset, + batch_size=self.hparams.batch_size, + shuffle=shuffle, + num_workers=self.hparams.num_workers, + ) + + # ---------------------------------------------------------------------------------- + # LOSS CALCULATION + # ---------------------------------------------------------------------------------- + @staticmethod + def embed_loss0(x_tilde, x): + """ + Loss guiding reversible mapping between feature and latent spaces to enable + embedding and recovery nets to reconstruct original data. + Args: + x_tilde: decoded real samples + x: real samples + + Returns: + Loss + """ + e_loss_t0 = nn.MSELoss()(x_tilde, x) + e_loss0 = 10 * torch.sqrt(e_loss_t0) + return e_loss0 + + @staticmethod + def supervisor_loss(h_hat_supervise, h): + """ + This loss further ensures that generator produces similar stepwise transitions + (evaluated by ground-truth targets). Responsible to capture how well the + generator approximates the next time step in the latent space. + Args: + h_hat_supervise: supervisors output from feeding h (real embedding) through + h: real embedding defined by embedder net + + Returns: + Loss + """ + gen_sup_loss = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) + return gen_sup_loss + + @staticmethod + def generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + emb_weight, + ): + """ + Loss of generator combining adversarial & supervisor losses together with + looking at difference between final synthetic output and original data. 
+ Args: + y_fake: logits for classification of fakes (from h_hat) + y_fake_e: logits for classification of fake embeddings (from e_hat) + h: real embedding defined by embedder net + h_hat_supervise: supervisors output from feeding h (real embedding) through + x: real samples + x_hat: decoded samples of embedding created by generator + emb_weight: weight defining how much embedded fake contributes to loss + + Returns: + Loss + """ + # adversarial + criterion = nn.BCEWithLogitsLoss() + g_loss_u = criterion(y_fake, torch.ones_like(y_fake)) + g_loss_u_e = criterion(y_fake_e, torch.ones_like(y_fake_e)) + w_g_loss_u_e = emb_weight * g_loss_u_e + # supervisor + g_loss_s = TimeGANLightning.supervisor_loss(h_hat_supervise, h) + # 2 moments + d = torch.sqrt(torch.var(x_hat, 0) + 1e-6) - torch.sqrt(torch.var(x, 0) + 1e-6) + g_loss_v1 = torch.mean(torch.abs(d)) + g_loss_v2 = torch.mean(torch.abs(torch.mean(x_hat, 0) - torch.mean(x, 0))) + g_loss_v = g_loss_v1 + g_loss_v2 + # sum + g_loss = g_loss_u + w_g_loss_u_e + 100 * torch.sqrt(g_loss_s) + 100 * g_loss_v + return g_loss + + @staticmethod + def embedder_loss(x_tilde, x, h_hat_supervise, h): + """ + Loss to further improve reversible mapping between feature and latent space, + combined with + Args: + x_tilde: decoded real samples + x: real samples + h_hat_supervise: supervisors output from feeding h (real embedding) through + h: real embedding defined by embedder net + + Returns: + Loss + """ + e_loss0 = TimeGANLightning.embed_loss0(x_tilde, x) + e_loss = e_loss0 + 0.1 * TimeGANLightning.supervisor_loss(h_hat_supervise, h) + return e_loss + + @staticmethod + def discriminator_loss(y_fake, y_fake_e, y_real, emb_weight): + """ + Discriminator’s binary adversarial feedback, both on fake and real data. Real + data is labelled as 1, fake as 0. 
+ Args: + y_fake: logits for classification of fakes (from h_hat) + y_fake_e: logits for classification of fake embeddings (from e_hat) + y_real: logits for classification of real embeddings (from h) + emb_weight: weight defining how much embedded fake contributes to loss + + Returns: + Loss + """ + criterion = nn.BCEWithLogitsLoss() + d_loss_fake_e = criterion(y_fake_e, torch.zeros_like(y_fake_e)) + d_loss_fake = criterion(y_fake, torch.zeros_like(y_fake)) + d_loss_real = criterion(y_real, torch.ones_like(y_real)) + # TODO: any use of dividing loss by (2 + emb_weight)? probably readability + return emb_weight * d_loss_fake_e + d_loss_fake + d_loss_real + + # ---------------------------------------------------------------------------------- + # OTHER CALCULATION + # ---------------------------------------------------------------------------------- + def _epoch_end(self, outputs, prefix="val"): + """ + We average the loss across all batches, calculate metrics based on all batches + and log them. Finally, we make plots using all the y_true and y_preds. 
+ Args: + outputs: + prefix: indicates train or val epoch end + """ + if prefix == "train": + pass + elif prefix == "val": + avg_loss = [] + y_real = [] + y_fake = [] + y_fake_e = [] + pca_x = [] + pca_x_hat = [] + tsne_x = [] + tsne_x_hat = [] + for x in outputs: + avg_loss.append(x[f"loss_disc/{prefix}"]) + y_real.append(x[f"y_real/{prefix}"]) + y_fake.append(x[f"y_fake/{prefix}"]) + y_fake_e.append(x[f"y_fake_e/{prefix}"]) + pca_x.append(x[f"pca_x/{prefix}"]) + pca_x_hat.append(x[f"pca_x_hat/{prefix}"]) + tsne_x.append(x[f"tsne_x/{prefix}"]) + tsne_x_hat.append(x[f"tsne_x_hat/{prefix}"]) + # log sampled images, only first batch (2 validation rounds @ start) + self._make_plots( + y_real[0], + y_fake[0], + pca_x[0], + pca_x_hat[0], + tsne_x[0], + tsne_x_hat[0], + prefix, + ) + + # ---------------------------------------------------------------------------------- + # PLOTTING AND LOGGING FUNCTIONS + # ---------------------------------------------------------------------------------- + def _plot_dataset( + self, fig_close: Figure, fig_data: Figure, fig_target: Figure, prefix: str + ): + """ + Plots the close price and the target column of the train/val/test datasets. + + Args: + fig_close: First element of the returned tuple of `CryptoDataset.plot()` + fig_data: Second element of the returned tuple of `CryptoDataset.plot()` + prefix: One of train, val, test. + """ + self._log_image(f"anchor_close/{prefix}", fig_close, 0) + + def _log_image(self, image_name, image_data, i): + """ + Logs any generated image to both tensorboard and comet. + """ + self.logger.experiment[0].add_image(image_name, fig_to_tb(image_data), i) + if self.comet_logging: + self.logger.experiment[1].log_image( + fig_to_comet(image_data), name=image_name, step=i + ) + + def _log_graph(self, datasets: GeneratorCryptoDataset): + """ + Logs the graph of the model to both tensorboard and comet. 
+ """ + examples_dataloader = DataLoader(datasets, batch_size=32) + example_shapes = [xi.shape for xi in next(iter(examples_dataloader))[0]] + examples = [torch.rand(*s).float().to(self.tgan_device) for s in example_shapes] + self.logger.experiment[0].add_graph(self, examples) + + def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, tsne_x, tsne_x_hat, prefix): + """ + Makes following useful summary plots: + - plotting 2-dim PCA for visualising diversity learned + - (discriminator's) histogram of y_true, y_fake, y_fake_e + """ + # PCA SCATTER + self._log_image( + f"real v fake PCA-scatter/{prefix}", + plot_pca(pca_x, pca_x_hat), + self.current_epoch, + ) + + # TSNE SCATTER + self._log_image( + f"real v fake TSNE-scatter/{prefix}", + plot_tsne(tsne_x, tsne_x_hat), + self.current_epoch, + ) + + # HISTOGRAM + # discriminator's take on real data + y_real_class = (torch.sigmoid(y_real) > 0.5).int() + y_real_for_hist = torch.sigmoid(y_real) + # discriminator's take on fake data + y_fake_class = (torch.sigmoid(y_fake) > 0.5).int() + y_fake_for_hist = torch.sigmoid(y_fake) + self._log_image( + f"real v fake hist/{prefix}", + hist_from_tensor(y_real, y_real_for_hist), + self.current_epoch, + ) + + # ---------------------------------------------------------------------------------- + # SANITY CHECK FUNCTIONS + # ---------------------------------------------------------------------------------- + + @staticmethod + def _pre_sanity_check(hparams: Namespace) -> Namespace: + """Certain sanity checks must happen before preprocessing takes place.""" + + # ensure we have the no specific target column in the config + if hparams.target_col: + raise ValueError("target_col has to be None for GAN development.") + + # fill in the same cols for any df that doesn't have the cols_to_model defined + if len(hparams.cols_to_model) > 1: + for df_name, cols in hparams.cols_to_model.items(): + if df_name != npa.anchor and (cols is None or len(cols) == 0): + hparams.cols_to_model[df_name] = 
deepcopy( + hparams.cols_to_model[npa.anchor] + ) + if hparams.rnn not in [NGAN.gru, NGAN.lstm]: + raise ValueError("rnn has to be either 'gru' or 'lstm'.") + return hparams diff --git a/src/dagobert/modelling/augmentation/utils.py b/src/dagobert/modelling/augmentation/utils.py new file mode 100644 index 00000000..72fde853 --- /dev/null +++ b/src/dagobert/modelling/augmentation/utils.py @@ -0,0 +1,67 @@ +"""Util functions for TimeGAN and other augmentation related tasks""" + +import torch +import numpy as np +from sklearn.decomposition import PCA +from sklearn.manifold import TSNE + + +def get_noise(n_samples: int, mini_series_length: int, z_dim: int, device: str = "cpu"): + """ + Function for creating noise vectors given the dimensions (n_samples, + mini_series_length, z_dim). Research shows that it is not hyperimportant which + distribution is the noise from, here we'll use uniform + + Args: + n_samples: the number of samples to generate + mini_series_length: length of series + z_dim: dimension for generator input at given time point + device: the device type + + Returns: + Tensor of filled with random numbers from uniform distribution. 
+ """ + return torch.rand(n_samples, mini_series_length, z_dim, device=device) + + +def pca_analysis(x, x_hat, components: int = 2): + """ + PCA on 2 (real and synthetic) datasets + Args: + x: real data of shape (batch, time, feature) + x_hat: synthetic data of the same shape + components: number of pca components to keep + + Returns: + 2 arrays of PCA-reduced real and synthetic data + """ + x = np.mean(x, 2) + x_hat = np.mean(x_hat, 2) + + pca = PCA(n_components=components) + pca.fit(x) + pca_results = pca.transform(x) + pca_hat_results = pca.transform(x_hat) + return pca_results, pca_hat_results + + +def tsne_analysis(x, x_hat, components: int = 2, n_iter=300): + """ + TSNE on 2 (real and synthetic) datasets + Args: + x: real data of shape (batch, time, feature) + x_hat: synthetic data of the same shape + components: number of components to keep + + Returns: + 2 arrays of TSNE-reduced real and synthetic data + """ + x = np.mean(x, 2) + x_hat = np.mean(x_hat, 2) + batch_len = x.shape[0] + + tsne = TSNE(n_components=components, n_iter=n_iter) + tsne_all = tsne.fit_transform(np.concatenate((x, x_hat), axis=0)) + tsne_results = tsne_all[:batch_len] + tsne_hat_results = tsne_all[batch_len:] + return tsne_results, tsne_hat_results diff --git a/src/dagobert/modelling/dl/__init__.py b/src/dagobert/modelling/dl/__init__.py index b69b500d..52b1e0bb 100644 --- a/src/dagobert/modelling/dl/__init__.py +++ b/src/dagobert/modelling/dl/__init__.py @@ -1,4 +1,4 @@ -from .data import CryptoDataset +from .data import PortfolioCryptoDataset, ExperienceSourceDataset, CryptoDataset from .tcn_net import TemporalConvNet from .utils import LogCoshLoss, FocalLoss, MixedNormalPDFLoss from .adabelief import AdaBelief diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index e1a29584..b3f043f6 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -1,10 +1,11 @@ """ -Classes defining PyTorch datasets for modelling. 
+Classes defining PyTorch datasets for supervised deep learning and multi-instrument +reinforcement learning. """ import logging from pathlib import Path from argparse import Namespace -from typing import List, Tuple, Union +from typing import List, Tuple, Union, Iterable, Callable import torch import numpy as np @@ -12,9 +13,9 @@ from matplotlib.figure import Figure from matplotlib import pyplot as plt from sklearn.preprocessing import MinMaxScaler -from torch.utils.data import Dataset, DataLoader +from torch.utils.data import Dataset, DataLoader, IterableDataset -from dagobert.naming import NBarVars, NAugmentationMethods +from dagobert.naming import NBarVars, NAugmentationMethods, NRL from dagobert.naming import NPreprocessingArgs as npa from dagobert.preprocessing.utils import set_dt_index from dagobert.modelling.augmentation import augment @@ -68,12 +69,12 @@ class CryptoDataset(Dataset): - std_bar_BTCUSDT_volume_250.feather - std_bar_BTCUSDT_volume_500.feather - Here, we have the lovest granularity ETHUSDT and BTCUSDT bars simultaneously used + Here, we have the lowest granularity ETHUSDT and BTCUSDT bars simultaneously used to model whether the ETH price will be up or down in 30 minutes: simple_lookahead_y. Both input DFs use 4 columns (OHLC) and both can be augmented with a 50% chance, meaning, on average in every batch 256 samples would come from the augment_dfs. The anchor can only be augmented by one of the other less granular ETHUSDT volume - bar datasets, while the secondary input DF (BTCUSDT) chas its own two augmentation + bar datasets, while the secondary input DF (BTCUSDT) has its own two augmentation data sources. Internally these 6 DFs will be rapackaged as a single dict (`self.dfs`), where the @@ -94,7 +95,7 @@ class CryptoDataset(Dataset): The reason why we disect the data (pandas DFs) into these dicts and list of np arrays is because of huge performance gains when we do the indexing in np instead - of pandas .loc. 
+ of using the .loc method of pandas. """ def __init__( @@ -199,7 +200,7 @@ def __getitem__(self, idx): def _load_df_anchor( self, - ) -> pd.DatetimeIndex: + ) -> pd.DataFrame: """ Loads the anchor DF, and returns it. We use the anchor df for plotting and to extract the master index which we measure everything else against in batching. @@ -265,7 +266,7 @@ def _load_dfs_indices_targets(self) -> Tuple[dict, dict, list]: if df_name == npa.anchor: targets.append(self._get_target(df)) - # load augmnet DFs - dict of list of paths + # load augment DFs - dict of list of paths if self.augment_dfs: for df_name, df_paths in self.augment_dfs.items(): if isinstance(df_paths, str): @@ -324,7 +325,7 @@ def _get_target(self, df: pd.DataFrame) -> np.array: """Returns the target values (y) to use for batching for a given DF.""" if self.simple_lookahead_y: # calculate simple moving average on the close original to smooth it - mean_bar_length = pd.Series(self.idx).diff().dt.seconds.mean() / 60 + mean_bar_length = pd.Series(self.idx).diff().dt.total_seconds().mean() / 60 window_size = int(np.round(self.simple_lookahead_y / mean_bar_length)) return ( df[npa.close_original] @@ -333,6 +334,9 @@ def _get_target(self, df: pd.DataFrame) -> np.array: .bfill() .ffill() ).values + # dummy solution for GAN - benefit is that rest of code is unchanged + elif not self.target_col: + return np.zeros(len(df)) else: return df[self.target_col].values @@ -386,7 +390,7 @@ def _get_from_upto_idxs( Returns the from and upto idx for a given sample in the batch given the idx. Since we are indexing with numerical idxes and not dates, if we have multiple dfs in df_train (e.g. anchor and df2), we need to ensure that df2's from and - upto idx-es are at not leaking info from the future and are from roughly the + upto idx-es are not leaking info from the future and are from roughly the same date time period. The same holds for the situation when we replaced anchor df with one of its augment_dfs. 
Therefore we always return lists of from_idxs and upto_idxs for each df in `batch_dfs`. @@ -540,7 +544,7 @@ def plot(self) -> Tuple[Figure]: # plot the data columns, add date_diff_seconds to data coumns for sanity check df_data = df[self.cols_to_model[npa.anchor]].copy(deep=True) - date_diff_secs = pd.Series(df.index).diff().dt.seconds.values + date_diff_secs = pd.Series(df.index).diff().dt.total_seconds().values df_data.insert(0, "date_diff_secs", date_diff_secs) subplot_cols_n = 5 cols_n_to_plot = len(df_data.columns) @@ -559,3 +563,93 @@ def plot(self) -> Tuple[Figure]: plt.ylabel("Count") plt.close() return fig_close, fig_data, fig_target + + +class PortfolioCryptoDataset(CryptoDataset): + """ + This extends :class:`dagobert.modelling.dl.data.CryptoDataset` to make it + suitable for multi instrument portfolio optimization through reinforcement-learning. + + Instead of returning an array of Xs and single y, this returns for each X a y. This + is achieved by adding the rl_return target column to the cols_to_model at init, and + then fishing it out for each sample before returning it. + + This convoluted way was used so we can repurposed and keep as much of the original + CryptoDataset as possible, without extensive refactoring. + """ + + def __init__(self, *args, **kwargs): + # for each instrument, we add the rl_return target col to their cols_to_model + for df_name, _ in kwargs[npa.cols_to_model].items(): + kwargs[npa.cols_to_model][df_name].append(NRL.rl_return) + # lazy way to check if we have datediff as first feature, if so, cumsum it + if kwargs[npa.cols_to_model][df_name][0] == "date_diff": + self.sum_date_diffs = True + else: + self.sum_date_diffs = False + super().__init__(*args, **kwargs) + + def __getitem__(self, idx): + """ + We don't need to calculate or fetch y, as we can simply use the last timepoint + for that. 
+ """ + idx = idx.tolist() if torch.is_tensor(idx) else idx + batch_dfs, batch_indices, _ = self._get_batch_dfs_indices_target() + from_idx, upto_idx = self._get_from_upto_idxs(idx, batch_indices) + Xs = self._get_Xs(batch_dfs, from_idx, upto_idx) + + # the last column is y (see __init__), so we fish it out and delete it from X + ys = np.empty(len(Xs)) + for i, X in enumerate(Xs): + ys[i] = X[-1, -1] + Xs[i] = X[:-1, :] + if self.sum_date_diffs: + # make the cumulative flow from right (present) to left (past) + cs = Xs[i][0][::-1].cumsum()[::-1] + Xs[i][0] = MinMaxScaler().fit_transform(cs.reshape([-1, 1])).ravel() + return Xs, ys + + +class ExperienceSourceDataset(IterableDataset): + """ + Implementation from PyTorch Lightning Bolts. This allows us to use Lightning in a + reinforcement learning setting where we first need to generate our training data + by interacting with the environment, and only then use it to train our policy. + + Basic experience source dataset. Takes a generate_batch function that returns an + iterator. The logic for the experience source and how the batch is generated is + defined ihbthe PPO Lightning model itself. + """ + + def __init__(self, generate_batch: Callable): + self.generate_batch = generate_batch + + def __iter__(self) -> Iterable: + iterator = self.generate_batch() + return iterator + + +class GeneratorCryptoDataset(CryptoDataset): + """ + This extends :class:`dagobert.modelling.dl.data.CryptoDataset` to make it + suitable for synthetic data generation through generative adversarial learning. + + Instead of returning an array of Xs and single y, this returns for only X. + This convuluted way was used so we can repurpose and keep as much of the original + CryptoDataset as possible, without extensive refactoring. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __getitem__(self, idx): + """ + We don't need to calculate or fetch y, as we only need X to be modelled. 
+ """ + idx = idx.tolist() if torch.is_tensor(idx) else idx + batch_dfs, batch_indices, _ = self._get_batch_dfs_indices_target() + from_idx, upto_idx = self._get_from_upto_idxs(idx, batch_indices) + Xs = self._get_Xs(batch_dfs, from_idx, upto_idx) + X = np.concatenate(Xs).T + return X diff --git a/src/dagobert/modelling/dl/preprocessing.py b/src/dagobert/modelling/dl/preprocessing.py index 6c4b8d12..dddf7c82 100644 --- a/src/dagobert/modelling/dl/preprocessing.py +++ b/src/dagobert/modelling/dl/preprocessing.py @@ -16,7 +16,7 @@ from sklearn.utils.validation import check_is_fitted from sklearn.exceptions import NotFittedError -from dagobert.naming import NBarriers, NCrossValidation, NTimeFeatures, NBarVars +from dagobert.naming import NBarriers, NCrossValidation, NTimeFeatures, NBarVars, NRL from dagobert.naming import NPreprocessingArgs as npa from dagobert.naming import NBarriers as nb from dagobert.io import FeatherWriter, S3Connector @@ -41,9 +41,9 @@ class Preprocessing(object): @staticmethod def preprocess_augment_dfs(hparams: Namespace) -> Namespace: """ - If DFs are defined in augment_dfs we download and scale them. We only - check the scaling_method parameter of hparams for deciding if we already have - a downloaeded and processed version in the data_dir. + If DFs are defined in augment_dfs we download and scale them plus label the + anchor. We only check the labelling & scaling_method parameters of hparams for + deciding if we already have a downloaeded and processed version in the data_dir. Args: hparams: Parsed hypere parameters of the experiment. 
@@ -138,6 +138,11 @@ def _preprocess_augment_dfs( df, hparams.binariser_method, hparams.binariser_threshold, df_name ) + # add rl return if required + if hparams.target_col == NRL.rl_return: + rl_return = (df[NBarVars.close] / df[NBarVars.close].shift()).fillna(1) + df.insert(0, NRL.rl_return, rl_return) + # scale all numeric columns if hparams.scaling_method is not None: cols_not_to_scale = set( @@ -162,7 +167,11 @@ def _preprocess_augment_dfs( logger.info(f"Fit-transformed {log_msg}") # save transformed file - if hparams.to_label or hparams.scaling_method: + if ( + hparams.to_label + or hparams.scaling_method + or hparams.target_col == NRL.rl_return + ): feather_writer = FeatherWriter(output_path=data_dir / df_path_prev) feather_writer.write(df) @@ -178,7 +187,7 @@ def preprocess_train_dfs(hparams: Namespace) -> Namespace: supplied parameters. If the `df_train` is defined, but the `df_val` and `df_test` are not, then we split the train data into 3, scale them, add sample weights to the train portion and if required binarise the label too. If a - particular combination of preprocessing parameters where already used and + particular combination of preprocessing parameters were already used and therefore we have an existing file already on the machine, we skip the input DF. Args: @@ -292,6 +301,11 @@ def _preprocess_train_dfs( df, hparams.binariser_method, hparams.binariser_threshold, df_name ) + # add rl return if required + if hparams.target_col == NRL.rl_return: + rl_return = (df[NBarVars.close] / df[NBarVars.close].shift()).fillna(1) + df.insert(0, NRL.rl_return, rl_return) + # split splitter = TrainValTestSplitter(data_connector=df) df_train, df_val, df_test = splitter.split( @@ -331,6 +345,7 @@ def _preprocess_train_dfs( df_to_scale = df_split[cols_to_scale].values.reshape(-1, 1) log_msg = f"{cols_to_scale} of {df_name} {df_split_name} with {id(sc)}." 
try: + #todo: scikit 0.24+ needs values.reshape(-1,1) to transform check_is_fitted(sc) df_split[cols_to_scale] = sc.transform(df_to_scale) logger.info(f"Transformed {log_msg}") @@ -452,12 +467,13 @@ def _get_scalers_for_train_dfs(hparams) -> dict: def _get_scalers_from_cols(cols: list, scaling_method: str) -> list: """ For a given dataset's columns, this works out which columns to scale together - (OHLC). It will also creates a scaler for the non-OHLC columns. Furthermore - it will create groups of the supplied OHLC columns depending on the fractional - differencing suffix _fd_x at the end of theses columns. Then it will add the - first element of each group with a scaler then the rest of the group - with the same scaler instance. This is to ensure that OHLC columns are scaled - together and not independently. + (OHLC & volume). It will also creates a scaler for the non-OHLC/volume columns. + Furthermore, it will create groups of the supplied OHLC columns depending on the + fractional differencing suffix _fd_x at the end of these columns. It will also + create groups of the supplied volume cols based on base or quote quantity. + Then it will add the first element of each group with a scaler then the rest of + the group with the same scaler instance. This is to ensure that OHLC & volume + columns are scaled together and not independently. Args: cols: Columns to work on and check for OHLC columns. 
@@ -475,10 +491,14 @@ def _instantiate_scaler(scaling_method): return MinMaxScaler() scalers = [] - # extrack OHLC cols + # extract OHLC and volume cols s_cols = pd.Series(list(cols)) ohlc_cols = s_cols[s_cols.str.contains("open|high|low|close")] - non_ohlc_cols = list(s_cols[~s_cols.str.contains("open|high|low|close")]) + vol_cols = list( + s_cols[(s_cols.str.contains("volume")) & (~s_cols.str.contains("quote"))] + ) + vol_quote_cols = list(s_cols[s_cols.str.contains("volume_quote")]) + non_ohlc_cols = list(s_cols[~s_cols.str.contains("open|high|low|close|volume")]) # find groups of OHLC cols for multiple fd values (easiest to with a df) dh_ohlc_cols_data = [x.split("_fd_") for x in ohlc_cols] + [["na", "na"]] @@ -494,7 +514,7 @@ def _instantiate_scaler(scaling_method): boolean_group_mask = df_ohlc_cols.suffix == unique_group_suffix ohlc_cols_groups.append(list(ohlc_cols.values[boolean_group_mask])) - # add scaler to non OHLC cols + # add scaler to non OHLC/volume cols scalers.append((_instantiate_scaler(scaling_method), non_ohlc_cols)) # add scalers to OHLC col groups: one for the 1st col; then same for rest @@ -502,6 +522,13 @@ def _instantiate_scaler(scaling_method): ohlc_scaler = _instantiate_scaler(scaling_method) scalers.append((ohlc_scaler, ohlc_cols_group.pop(0))) scalers.append((ohlc_scaler, ohlc_cols_group)) + + # add scaler to volume cols, like OHLC + for vol_group in [vol_cols, vol_quote_cols]: + vol_scaler = _instantiate_scaler(scaling_method) + scalers.append((vol_scaler, vol_group.pop(0))) + scalers.append((vol_scaler, vol_group)) + return scalers @staticmethod diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index 3db77df8..e17790a6 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from typing import Optional -from functools import partial from argparse import Namespace import numpy as np @@ -178,11 +177,12 @@ def 
__init__(self, hparams: Namespace): Args: hparams: Hyper-params passed in to the module. See the docs for more details https://pytorch-lightning.readthedocs.io/en/latest/hyperparameters.html + and dagobert.modelling.dl.tcn_args for more information on the params. """ # define main vars (other than model) super().__init__() - TCNLightning._pre_sanity_check(hparams) + hparams = TCNLightning._pre_sanity_check(hparams) # lightning sets this to cuda too late for some of our setup to work self.tcn_device = "cuda" if hparams.gpus > 0 else "cpu" hparams = Preprocessing().preprocess_augment_dfs(hparams) @@ -626,7 +626,7 @@ def _log_graph(self, datasets: CryptoDataset): # ---------------------------------------------------------------------------------- @staticmethod - def _pre_sanity_check(hparams: Namespace): + def _pre_sanity_check(hparams: Namespace) -> Namespace: """Certain sanity checks must happen before preprocessing takes place.""" # TARGET VARIABLE if ( @@ -654,38 +654,12 @@ def _pre_sanity_check(hparams: Namespace): "Classification is not applicable with mixed density nets" ) - # NET - net_depth = len(hparams.num_channels) - k_size = hparams.kernel_size - max_seq_len = TemporalConvNet.get_tcn_receptive_field_size(k_size, net_depth) - logger.info( - f"A TCN with kernel size: {k_size} and depth: {net_depth} has a receptive " - f"field (can read a maximum sequence length) of {max_seq_len}." - ) - if hparams.mini_series_length == "auto": - logger.info(f"We set mini_series_length from 'auto' to {max_seq_len}.") - hparams.mini_series_length = max_seq_len - if ( - hparams.mini_series_length != "auto" - and hparams.mini_series_length > max_seq_len - ): - logger.warning( - f"Provided mini-series length: {hparams.mini_series_length} is " - f"larger than the networks receptive field size: {max_seq_len}." 
- ) - # calcualte what the current TCN setup corresponds to in hourly lookback - df_anchor = TCNLightning._load_anchor(hparams) - hparams.lookback = update_lookback(df_anchor, hparams.mini_series_length) - logger.info( - f"The current mini_series_legnth {hparams.mini_series_length}, " - f"corresponds to an estimated lookback of {hparams.lookback} hours." - ) - # ETC if hparams.augment_dfs and npa.anchor not in hparams.augment_dfs.keys(): raise ValueError( "If you use augment_dfs you must at least define the anchor key." ) + return hparams def _sanity_check(self): """ @@ -704,6 +678,9 @@ def _sanity_check(self): ): raise ValueError("You can either provide both df_val/df_test or neither!") + # MINI SERIES / LOOKBACK + self.hparams = TCNLightning._check_mini_series_lookback(self.hparams) + # TARGET VARIABLE if not self.hparams.regression: if self.hparams.simple_lookahead_reg: @@ -745,3 +722,46 @@ def _sanity_check(self): raise ValueError( "non_last_y_frac has to be between 0 and 1 when using lasy_y=False." ) + + @staticmethod + def _check_mini_series_lookback(hparams: Namespace) -> Namespace: + """ + Calculate lookback and mini_series_length if necessary. + + Note, this works for both DL and RL (with two simultaneously trained nets). + However, for RL, we use the actor network's params to set the mini_series_len. + """ + + cases = ["", "critic_", "actor_"] + for case in cases: + num_channels = f"{case}num_channels" + k_size = f"{case}kernel_size" + if num_channels in hparams: + net_depth = len(hparams[num_channels]) + k_size = hparams[k_size] + max_seq_len = TemporalConvNet.get_tcn_receptive_field_size( + k_size, net_depth + ) + logger.info( + f"A {case}TCN with kernel size: {k_size} and depth: {net_depth} " + f"can read a maximum sequence length of {max_seq_len}." 
+ if hparams.mini_series_length == "auto": + logger.info(f"We set mini_series_length from 'auto' to {max_seq_len}.") + hparams.mini_series_length = max_seq_len + if ( + hparams.mini_series_length != "auto" + and hparams.mini_series_length > max_seq_len + ): + logger.warning( + f"Provided mini-series length: {hparams.mini_series_length} is " + f"larger than the network's receptive field size: {max_seq_len}." + ) + # calculate what the current TCN setup corresponds to in hourly lookback + df_anchor = TCNLightning._load_anchor(hparams) + hparams.lookback = update_lookback(df_anchor, hparams.mini_series_length) + logger.info( + f"The current mini_series_length {hparams.mini_series_length}, " + f"corresponds to an estimated lookback of {hparams.lookback} hours." + ) + return hparams diff --git a/src/dagobert/modelling/dl/tcn_args.py b/src/dagobert/modelling/dl/tcn_args.py index d08b0a0b..48370ef3 100644 --- a/src/dagobert/modelling/dl/tcn_args.py +++ b/src/dagobert/modelling/dl/tcn_args.py @@ -1,9 +1,11 @@ """ All custom arguments and hyper-parameters for the TCN Lightning module. """ +from typing import Union from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + from pytorch_lightning import Trainer from dagobert.modelling.dl.tcn import TCNLightning @@ -79,9 +81,12 @@ def add_run_specific_args(parent_parser): ) parser.add_argument( "--lr", - type=float, + type=Union[float, dict], default=0.003, - help="Learning rate. If set to 'auto' we'll find it automatically.", + help=( + "Learning rate. If set to 'auto' we'll find it automatically. 
In TimeGAN" + "different learning rates can be used for the various networks" + ), ) parser.add_argument( "--max_lr", diff --git a/src/dagobert/modelling/dl/tcn_net.py b/src/dagobert/modelling/dl/tcn_net.py index b49b139a..344ad328 100644 --- a/src/dagobert/modelling/dl/tcn_net.py +++ b/src/dagobert/modelling/dl/tcn_net.py @@ -48,33 +48,34 @@ def __init__( dilation, padding, dropout=0.2, + no_weight_norm=False, ): super(TemporalBlock, self).__init__() - self.conv1 = weight_norm( - nn.Conv1d( - n_inputs, - n_outputs, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - ) + self.conv1 = nn.Conv1d( + n_inputs, + n_outputs, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, ) + if not no_weight_norm: + self.conv1 = weight_norm(self.conv1) self.chomp1 = Chomp1d(padding) self.relu1 = nn.ReLU() self.dropout1 = nn.Dropout(dropout) self.batch_norm1 = nn.BatchNorm1d(n_outputs) - self.conv2 = weight_norm( - nn.Conv1d( - n_outputs, - n_outputs, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - ) + self.conv2 = nn.Conv1d( + n_outputs, + n_outputs, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, ) + if not no_weight_norm: + self.conv2 = weight_norm(self.conv2) self.chomp2 = Chomp1d(padding) self.relu2 = nn.ReLU() self.dropout2 = nn.Dropout(dropout) @@ -112,6 +113,7 @@ def __init__( dropout: float = 0.2, time_feat_n: int = 1, time_embed_dim: int = 12, + no_weight_norm: bool = False, ): """ Class constructor. @@ -126,6 +128,8 @@ def __init__( time_feat_n: Number of time features per input DF. Note this has to be consistent across all input DFs, you can't mix and match. time_embed_dim: Dimensionality of time2vec vectors. + no_weight_norm: If True, we don't add weight_norm to 1dconv layers. See + no_weight_norm param help in `tcn_args.py` for more info. 
""" super(TemporalConvNet, self).__init__() @@ -147,6 +151,7 @@ def __init__( dilation=1, padding=(kernel_size - 1), dropout=dropout, + no_weight_norm=no_weight_norm, ) ) @@ -167,6 +172,7 @@ def __init__( dilation=dilation_size, padding=(kernel_size - 1) * dilation_size, dropout=dropout, + no_weight_norm=no_weight_norm, ) ] self.later_layers = nn.Sequential(*layers) diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py new file mode 100644 index 00000000..fd00b0bd --- /dev/null +++ b/src/dagobert/modelling/rl/__init__.py @@ -0,0 +1,3 @@ +from .environment import RLData, RLPortfolio, RLEnv +from .networks import ActorCriticTCN, ActorCriticAgent, ActorContinuous +from .ppo import PPO diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py new file mode 100644 index 00000000..7ce7a15c --- /dev/null +++ b/src/dagobert/modelling/rl/environment.py @@ -0,0 +1,296 @@ +""" +Class defining PyTorch datasets for supervised modelling of a single instrument. +""" +import logging +from copy import deepcopy +from typing import List, Tuple +from argparse import Namespace + +import gym +import torch +import numpy as np +from torch.utils.data import Dataset, DataLoader +from torch.utils.data.dataloader import default_collate + +from dagobert.naming import NPreprocessingArgs as npa +from dagobert.modelling.dl import PortfolioCryptoDataset + + +logger = logging.getLogger(__name__) +eps = np.finfo(float).eps + + +class RLData(object): + """ + Creates a multi-head data reader, meaning, we can concurrently return the next + state of the environment for arbitrarily many times (see step function). Leverages + the data class and configuration methods from the `dagobert.modelling.dl` module as + much as possible. + """ + + def __init__( + self, + hparams: Namespace, + train_val_test: str = "train", + ): + """ + Class constructor. + + Args: + hparams: Hyperparams parsed by the rl_runner. 
Similar to how `TCNLightning` + is initialized with the following fields: + - max_episode_length + - cols_to_model + - target_col + - mini_series_length + - data_dir + - augment_dfs + - augment_prob + - augment_method + - augment_dfs_mix + - num_env_heads + train_val_test: Whether we are training, validating or testing, it must be + either train, val or test. + """ + self.hparams = hparams + if train_val_test == "train": + augment_dfs = self.hparams.augment_dfs + augment_method = self.hparams.augment_method + else: + augment_dfs = None + augment_method = None + self.dataset = PortfolioCryptoDataset( + df_to_load=getattr(self.hparams, f"df_{train_val_test}"), + cols_to_model=deepcopy(self.hparams.cols_to_model), + target_col=self.hparams.target_col, + mini_series_length=self.hparams.mini_series_length, + data_dir=self.hparams.data_dir, + augment_method=augment_method, + augment_prob=self.hparams.augment_prob, + augment_dfs=augment_dfs, + augment_dfs_mix=self.hparams.augment_dfs_mix, + ) + self.dataset_len = len(self.dataset) + self.latest_idx = self.dataset_len - self.hparams.max_episode_length + self._reset_idxs() + + def step2(self): + Xs, ys = self.dataset[self.idxs[0]] + # add cash price (always 1) to the new price vector + y1 = np.concatenate([[1.0], ys]) + # turn Xs into a batch of 1, ready to be fed into the actor/critic + Xs = [torch.Tensor(x).unsqueeze(0) for x in Xs] + self.idxs += 1 + return Xs, y1 + + def step(self): + Xs = [] + ys = [] + for idx in self.idxs: + X, y = self.dataset[idx] + # Xs.append([torch.Tensor(x).unsqueeze(0) for x in X]) + # making sure have float32 data so we don't get torch.float64 tensors later + Xs.append([x.astype("float32") for x in X]) + ys.append(y) + self.idxs += 1 + + # add cash price (always 1) to the new price vector (a column of ones) + ys = np.vstack(ys) + y1 = np.ones((ys.shape[0], ys.shape[1] + 1)) + y1[:, 1:] = ys + return default_collate(Xs), y1 + + def reset(self): + self._reset_idxs() + return self.step() + + def 
_reset_idxs(self): + # reset all head's starting index + self.idxs = np.random.randint(self.latest_idx, size=self.hparams.num_env_heads) + + +class RLPortfolio(object): + """ + Portfolio management class, loosely based on https://arxiv.org/abs/1706.10059 + + I started with this https://github.com/wassname/rl-portfolio-management + and compared it with the article and the original implementation (see below). + + The original implementation is a bit of a shitshow but the calculation of mu at + least agrees with what we have here: + https://github.com/ZhengyaoJiang/PGPortfolio/blob/master/pgportfolio/learn/nnagent.py + however there are questions around how the code corresponds to the paper, e.g.: + https://github.com/ZhengyaoJiang/PGPortfolio/issues/99 + """ + + def __init__(self, hparams: Namespace): + """ + Class constructor. + + Args: + hparams: Hyparams parsed by the rl_runner. Similar to how `TCNLightning` is + initialized with the following fields: + - asset_names + - num_env_heads + - trading_cost + - reward_type + """ + self.asset_names = hparams.asset_names + self.num_env_heads = hparams.num_env_heads + self.asset_n = len(self.asset_names) + self.trading_cost = hparams.trading_cost + self.reward_type = hparams.reward_type + self.reset() + + def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: + """ + See Figure 1 in https://arxiv.org/abs/1706.10059 to understand what this one + step corresponds to, also the equation numbers correspond to the paper's. + + Args: + w1: New weights of the portfolio. + y1: New relative price vector for the portfolio's instruments. The first + element refers to the cash asset (USD). Therefore y1[0] = 1 always. + + Returns: + Typical reward, info, done vars for an OpenAI Gym. 
+ """ + w0 = self.w0 + p0 = self.p0 + m0 = self.m0 + + # market return for new timepoint for each head + m1 = m0 * y1.mean(axis=1) + + # (eq7) since we last acted prices changed, so weights evolve into + new_price_old_weights_sum = np.sum(y1 * w0, axis=1) + dw1 = ((y1 * w0).T / (new_price_old_weights_sum + eps)).T + + # (eq16) cost to change portfolio: + # excluding change in cash to avoid double counting for transaction cost + mu = self.trading_cost * (np.abs(dw1[:, 1:] - w1[:, 1:])).sum(axis=1) + + # (eq11) final portfolio value + p1 = p0 * (1 - mu) * new_price_old_weights_sum + + # (eq9 & 10) rate of return log rate of return + rho1 = p1 / p0 - 1 # rate of returns + r1 = np.log(p1 + eps) - np.log(p0 + eps) + + # (eq22) immediate reward is log rate of return scaled by episode length + if self.reward_type == "return": + reward = r1 + elif self.reward_type == "portfolio_vs_market": + reward = np.log(p1 + eps) - np.log(m1 + eps) + # TODO: implement the differentiable sharpe ratio reward + # https://quant.stackexchange.com/a/38040 + + # remember for next step + self.w0 = w1 + self.p0 = p1 + self.m0 = m1 + + # if we run out of money we're done: all env heads are linked here unfortunately + done = np.any(p1 <= 0) + + infos = [] + for i in range(self.num_env_heads): + info = { + "reward": reward[i], + "log_return": r1[i], + "portfolio_value": p1[i], + "market_return": m1[i], + "rate_of_return": rho1[i], + "weights_std": w1[i].std(), + "rebalancing_cost": mu[i], + } + # record weights and prices + for j, name in enumerate(["USD"] + self.asset_names): + info["weight_" + name] = w1[i, j] + info["price_" + name] = y1[i, j] + infos.append(info) + return reward, infos, done + + def reset(self): + # weights for each head + self.w0 = np.zeros((self.num_env_heads, self.asset_n + 1)) + self.w0[:, 0] = 1 + # portfolio value for each head + self.p0 = np.ones(self.num_env_heads) + # market return for each head + self.m0 = np.ones(self.num_env_heads) + + +class RLEnv(gym.Env): + """ 
+ A reinforcement learning environment for financial portfolio management, based on + https://arxiv.org/abs/1706.10059 and this implementation + https://github.com/wassname/rl-portfolio-management + """ + + def __init__(self, hparams: Namespace): + """ + An environment for financial portfolio management. + + Args: + hparams: Hyperparams parsed by the rl_runner. Similar to how `TCNLightning` is + initialized with the following fields: + - max_episode_length + - cols_to_model + - target_col + - mini_series_length + - data_dir + - augment_dfs + - augment_prob + - augment_method + - augment_dfs_mix + - asset_names + - trading_cost + - reward_type + """ + self.infos = [] + self.hparams = hparams + self.asset_n = len(self.hparams.asset_names) + self.feat_n = len(self.hparams.cols_to_model[npa.anchor]) + self.data = RLData(self.hparams, train_val_test="train") + self.portfolio = RLPortfolio(self.hparams) + + # setup openai gym env - include cash in the portfolio action space + self.action_space = gym.spaces.Box( + 0.0, 1.0, shape=(self.asset_n + 1,), dtype=np.float32 + ) + # observation space isn't used anywhere, but we define it for documentation + self.observation_space = gym.spaces.Dict( + { + "state": gym.spaces.Box( + low=-10, + high=10, + shape=(self.asset_n, self.feat_n, self.hparams.mini_series_length), + dtype=np.float32, + ), + "weights": self.action_space, + } + ) + + def step(self, action: np.array): + """ + Step in the environment. + + Args: + action: Portfolio weights for the N assets and the cash (first item). + They should all be between 0 and 1 (no shorting) and sum to 1. 
+ """ + next_state, y1 = self.data.step() + reward, info, done = self.portfolio.step(action, y1) + self.infos.append(info) + + return next_state, reward, done, info + + def reset(self): + self.infos = [] + self.portfolio.reset() + next_state, _ = self.data.reset() + return next_state + + def render(self): + pass diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py new file mode 100644 index 00000000..238a03db --- /dev/null +++ b/src/dagobert/modelling/rl/networks.py @@ -0,0 +1,217 @@ +# pylint: disable=no-member +from argparse import Namespace +from typing import Union, Tuple + +import numpy as np +import torch +from torch import nn +from torch.distributions import Dirichlet + +from dagobert.modelling.dl import TemporalConvNet + + +class ActorCriticTCN(nn.Module): + """ + Creates either the actor/policy or the critic/value network as a TCN net, followed + by the appropriate linear layers. + """ + + def __init__( + self, hparams: Namespace, n_actions: int, output_size: int, actor: bool = True + ) -> nn.Module: + """ + Init a TCN like we do in `dagobert.modelling.dl.tcn`. + + Args: + hparams: Hparam parsed and updated by PPO module in dagobert.modelling.rl. + n_actions: Dimension of actions which is one of the inputs to the networks + along with the state (history price tensor). + output_size: Number of units at the end of the network. This is + different for actor/critic. + actor: If True, we are using the network params in hparams for the actor + net, else we take the params for the critic. + + Returns: + Initiated TCN with the appropriate size for actor or critic. 
+ """ + super().__init__() + self.hparams = hparams + self.n_actions = n_actions + self.actor = actor + num_inputs = [len(cols) for dataset, cols in hparams.cols_to_model.items()] + num_channels = ( + hparams.actor_num_channels if actor else hparams.critic_num_channels + ) + kernel_size = hparams.actor_kernel_size if actor else hparams.critic_kernel_size + dropout = hparams.actor_dropout if actor else hparams.critic_dropout + self.tcn = TemporalConvNet( + num_inputs=num_inputs, + num_channels=num_channels, + kernel_size=kernel_size, + dropout=dropout, + time_feat_n=hparams.time_feat_n, + time_embed_dim=hparams.time_embed_dim, + no_weight_norm=hparams.no_weight_norm, + ) + self.linear_a = nn.Linear(n_actions + 1, num_channels[-1]) + self.linear1 = nn.Linear(hparams.mini_series_length, 1) + self.linear2 = nn.Linear(num_channels[-1] * 2, output_size) + self.linear_m = nn.Linear(num_channels[-1] * 2, 1) + + def forward(self, state, past_pw): + s1 = self.tcn(*state) + a1 = torch.tanh(self.linear_a(past_pw)) + if self.hparams.use_last_timepoint: + s2 = s1[:, :, -1] + else: + s2 = torch.tanh(self.linear1(s1).squeeze(-1)) + # bring together the state and past_pw representations + if self.actor: + # m decides whether we update old weights or not by mixing past_pw and new + m = torch.sigmoid(self.linear_m(torch.cat([s2, a1], dim=1))) + past_w = past_pw[:, 1:] + return m * past_w + (1 - m) * self.linear2(torch.cat([s2, a1], dim=1)) + else: + return self.linear2(torch.cat([s2, a1], dim=1)) + + +class ActorContinuous(nn.Module): + """ + Policy network, for continuous action spaces, which returns a distribution + and an action given an observation + """ + + def __init__(self, actor_net: TemporalConvNet, sample_size: int = 1): + """ + The original PPO can be used for discrete action spaces with a Categorical + distribution or for a continuous actions space with a multivariate Gaussian, + where the network's outputs as raw logits go into it as the vector of mu and + the std is a 
separate learned parameter (same for all components). + + This is fine if we want to sample unbounded continuous actions between -inf/inf, + but here we need a mixture of weights for our portfolio that sums up to one. + + The recommended thing to do in RL circles in this setup is to use DDPG, which + is a completely different and deterministic policy gradient algo. Instead of + that, here we implement an idea that I found here on this reddit discussion + https://www.reddit.com/r/reinforcementlearning/comments/cl2kqn/ + special_case_of_continuous_action_space_rl/ + where they recommend swapping the Gaussian distribution for a Dirichlet one + and sampling our actions from that. This by design returns a probability + summing to one and there's no need to learn a separate std param. + + We can make this more deterministic with the sample_size param, see docs. + + NOTE! + I'm not sure how well this works or how legit it is, as I haven't found + any papers or implementations actually doing this. + + Args: + actor_net: Initialized actor net. + sample_size: Determines how deterministic our Dirichlet based sampling is. + At default (1), we return a single sample from the dist. With higher + sample sizes the returned weights are closer and closer to the actual + mean of the distribution. If it's set to zero, we return the mean and + basically the model becomes deterministic. 
+ """ + super().__init__() + self.actor_net = actor_net + self.inv_lin = InverseLinear() + self.sample_size = sample_size + + def forward(self, states, past_pw): + # get params for Dirichlet, and drop batch dim if batch_size=1 + logits = self.actor_net(states, past_pw) + concentrations = self.inv_lin(logits).squeeze(0) + pi = Dirichlet(concentrations) + + # take 20 samples - corresponds to +/- 5% compared to returning the mean + if self.sample_size > 0: + actions = pi.sample((self.sample_size,)).mean(dim=0) + else: + actions = pi.mean + + # very rarely we get actions that don't sum to 1 or are negative, fix it here + if actions.sum() != past_pw.shape[0] or torch.any(actions < 0): + actions = torch.clamp(actions, 0.01, 0.99) + actions = (actions.T / actions.sum(dim=-1)).T + return pi, actions, logits + + def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): + """ + Takes in a distribution and actions and returns log prob of actions under + the distribution + + Args: + pi: torch distribution + actions: actions taken by distribution + Returns: + log probability of the action under pi + """ + return pi.log_prob(actions) + + +class ActorCriticAgent(object): + """ + Actor Critic Agent used during trajectory collection. It returns a + distribution and an action given an observation. 
Agent based on the + implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/agent.py + + """ + + def __init__(self, actor_net: nn.Module, critic_net: nn.Module): + self.actor_net = actor_net + self.critic_net = critic_net + + @torch.no_grad() + def __call__( + self, state: torch.Tensor, past_pw: torch.Tensor, device: str + ) -> Tuple: + """ + Takes in the current state and returns the agents policy, sampled + action, log probability of the action, and value of the given state + Args: + state: current state of the environment + past_pw: the previous portfolio value and weights + device: the device used for the current batch + Returns: + torch distribution and randomly sampled action, the logits that went into + the Dirichlet dist, the probability of the sample, the estimated reward + for this action by the critic + """ + + state = [s.to(device=device) for s in state] + past_pw = past_pw.to(device=device) + + pi, actions, actor_logits = self.actor_net(state, past_pw) + log_p = self.get_log_prob(pi, actions) + + value = self.critic_net(state, past_pw) + return pi, actions, actor_logits, log_p, value + + def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor) -> torch.Tensor: + """ + Takes in the current state and returns the agents policy, a sampled + action, log probability of the action, and the value of the state + Args: + pi: torch distribution + actions: actions taken by distribution + Returns: + log probability of the action under pi + """ + return self.actor_net.get_log_prob(pi, actions) + + +class InverseLinear(nn.Module): + """ + Implements a layer specifically designed for Dirichlet distribution as final + layer, see here: https://openreview.net/pdf?id=BJeRg205Fm + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + x[x >= 0] = x[x >= 0] + 1 + x[x < 0] = 1 / (1 - x[x < 0]) + return x diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py new file mode 100644 index 00000000..0200ff0b 
--- /dev/null +++ b/src/dagobert/modelling/rl/ppo.py @@ -0,0 +1,669 @@ +""" +Pytorch Lightning module of Proximal Policy Optimization RL algorithm, taken and +modified from https://github.com/sid-sundrani/ppo_lightning. +""" +# pylint: disable=no-member +import sys +import logging +from copy import deepcopy +from pathlib import Path +from typing import List, Tuple +from argparse import Namespace +from itertools import chain + +import gym +import torch +import numpy as np +import pandas as pd +import torch.optim as optim +from torch.utils.data import DataLoader +from pytorch_lightning import LightningModule +from pytorch_lightning import Trainer, Callback, loggers +from pytorch_lightning.metrics import functional as plm +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.trainer import seed_everything + +from dagobert.naming import NRL, NStudy, NPreprocessingArgs as npa +from dagobert.modelling.rl import ( + RLEnv, + ActorCriticTCN, + ActorContinuous, + ActorCriticAgent, +) +from dagobert.modelling.dl import ( + ExperienceSourceDataset, + Preprocessing, + TCNLightning, +) + + +logger = logging.getLogger(__name__) +torch.multiprocessing.set_sharing_strategy("file_system") +mp = torch.multiprocessing.get_context("spawn") +eps = np.finfo(float).eps + + +def run_rl(args): + # setup loggers + seed_everything(args.seed) + tb_logger_name = None + comet_name = args.exp_name + tcn_loggers = [] + tb_logger = loggers.TensorBoardLogger( + save_dir=Path(args.log_dir), name=args.exp_name, version=tb_logger_name + ) + tcn_loggers.append(tb_logger) + if not args.no_comet_logger: + tcn_loggers.append( + loggers.CometLogger( + api_key=NStudy.comet_api_key, + workspace=NStudy.comet_workspace, + save_dir=args.log_dir, + project_name=NStudy.comet_project_name, + experiment_name=f"{comet_name}_{tb_logger.version}", + ) + ) + + # setup callbacks + checkpoint_callback = ModelCheckpoint( + monitor="avg_total_reward", + filename="_{epoch:02d}_{avg_reward:.10f}", 
+ dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", + save_top_k=3, + mode="max", + ) + + # define trainer and and lightning module + args.multiprocessing = True if args.gpus != 1 else False + args.windows = True if "win" in sys.platform else False + trainer = Trainer.from_argparse_args( + args, + logger=tcn_loggers, + checkpoint_callback=checkpoint_callback, + ) + model = PPO(args) + trainer.fit(model) + # trainer.test() + + +class PPO(LightningModule): + """ + PyTorch Lightning implementation of PPO https://arxiv.org/abs/1707.06347 + The algorithm closely follows this: + https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py + """ + + def __init__(self, hparams: Namespace): + """ + Class constructor. + + Args: + hparams: Hyper-params passed in to the module. See the docs for more details + https://pytorch-lightning.readthedocs.io/en/latest/hyperparameters.html + and dagobert.modelling.rl.rl_args for more information on the params. + """ + super().__init__() + # sanity check and setup device + hparams = PPO._pre_sanity_check(hparams) + self.tcn_device = "cuda" if hparams.gpus > 0 else "cpu" + + # prepare datafiles if necessary + hparams = Preprocessing().preprocess_augment_dfs(hparams) + hparams = Preprocessing().preprocess_train_dfs(hparams) + self.hparams = TCNLightning._check_mini_series_lookback(hparams) + + # create env, policy/value networks and experience buffer + tracking vars + self.envs = [RLEnv(self.hparams) for _ in range(self.hparams.num_env_workers)] + n_actions = self.envs[0].action_space.shape[0] + self.critic = ActorCriticTCN( + self.hparams, n_actions=n_actions, output_size=1, actor=False + ) + self.actor = ActorContinuous( + ActorCriticTCN(self.hparams, n_actions=n_actions, output_size=n_actions), + self.hparams.actor_dirichlet_sample_size, + ) + self.agent = ActorCriticAgent(self.actor, self.critic) + self.buffer = ExperienceBuffer() + self.to_log = {} + + # 
---------------------------------------------------------------------------------- + # EXPERIENCE COLLECTION FOR TRAIN DATALOADER + # ---------------------------------------------------------------------------------- + + def train_dataloader(self) -> DataLoader: + """Initialize the Experience Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.generate_experience_buffer) + dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size) + return dataloader + + def generate_experience_buffer( + self, + ) -> Tuple[ + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + ]: + """ + Logic for generating trajectory data to train policy and value networks. If + `num_env_workers` > 1, this is done leveraging the `Process` and `Queue` + classes of the `multiprocessing` module of Python. We'll launch + hparams.num_env_workers number of new processes, each replicating the + environment in memory, so this can get expensive where `num_assets` in the + portfolio is large. If `num_env_workers` = 1, we simply collect experience with + the main environment in the main process, running PPO. + + Yield: + Tuple of Lists containing tensors for states, actions, log probs, qvals + and advantage. 
+ """ + device = self.setup_model_for_experience_gathering() + # spawn multiple processes and gather experience in parallel + if self.hparams.num_env_workers > 1: + max_steps_per_worker = int( + self.hparams.steps_per_epoch + / self.hparams.num_env_workers + / self.hparams.num_env_heads + ) + parallel_experiences = ParallelExperiences() + for i in range(self.hparams.num_env_workers): + args = ( + self.envs[i], + self.agent, + device, + max_steps_per_worker, + self.hparams.max_episode_length, + len(self.hparams.asset_names), + self.hparams.num_env_heads, + self.hparams.gamma, + self.hparams.lam, + self.hparams.pgportfolio, + ) + parallel_experiences.create_worker(*args) + # collect experiences in parallel, then merge them, calculate metrics + self.buffer.merge_buffers(parallel_experiences.collect_experiences()) + else: + self.buffer = gather_experience( + self.envs[0], + self.agent, + device, + int(self.hparams.steps_per_epoch / self.hparams.num_env_heads), + self.hparams.max_episode_length, + len(self.hparams.asset_names), + self.hparams.num_env_heads, + self.hparams.gamma, + self.hparams.lam, + self.hparams.pgportfolio, + ) + self.update_metrics_to_log() + self.setup_model_for_training() + + # yield a dataset for dataloader for updating actor/critic and clear buffer + for state, past_pw, action, old_logp, qval, adv in zip( + self.buffer.states, + self.buffer.past_pws, + self.buffer.actions, + self.buffer.logps, + self.buffer.qvals, + self.buffer.advs, + ): + yield state, past_pw, action, old_logp, qval, adv + self.buffer.clear_buffer() + + def setup_model_for_experience_gathering(self): + """Moves model to CPU if necessary for parallel experience gathering.""" + # dropout and batch-norm doesn't make sense for experience gathering + self.agent.critic_net.eval() + self.agent.actor_net.eval() + if self.hparams.windows: + if self.hparams.num_env_workers > 1: + # we cannot use cuda tensor sharing on windows for multiprocessing + device = "cpu" + 
self.agent.critic_net.cpu() + self.agent.actor_net.cpu() + else: + device = self.device + else: + device = self.device + if self.hparams.num_env_workers > 1: + # use cuda tensor sharing on linux + self.agent.critic_net.share_memory() + self.agent.actor_net.share_memory() + return device + + def setup_model_for_training(self): + """Moves model back to GPU if necessary after parallel experience gathering.""" + if self.hparams.windows and self.hparams.gpus != 0: + self.agent.critic_net.cuda() + self.agent.actor_net.cuda() + self.agent.critic_net.train() + self.agent.actor_net.train() + + def update_metrics_to_log(self): + """Helper function recalculating metrics we track at end of each epoch""" + done_eps = self.buffer.done_episodes + eps + ep_rewards = self.buffer.epoch_rewards + + # pytorch lightning model checkpoint needs metric name without / + e = "episode" + self.to_log["avg_total_reward"] = ep_rewards / done_eps + self.to_log[f"{e}/avg_total_reward"] = ep_rewards / done_eps + self.to_log[f"{e}/avg_step_reward"] = ep_rewards / self.hparams.steps_per_epoch + self.to_log[f"{e}/avg_len"] = self.hparams.steps_per_epoch / done_eps + + p = "portfolio" + infos = pd.DataFrame(list(self.buffer.infos)).mean() + p_val = np.array(list(self.buffer.p_ep_end_value)) + m_ret = np.array(list(self.buffer.p_ep_end_market_return)) + self.to_log[f"{p}/avg_value_ep_end"] = p_val.mean() + self.to_log[f"{p}/avg_market_return_ep_end"] = m_ret.mean() + self.to_log[f"{p}/avg_portfolio_vs_market"] = (p_val - m_ret).mean() + self.to_log[f"{p}/avg_value"] = infos["portfolio_value"] + self.to_log[f"{p}/avg_rebalancing_cost"] = infos["rebalancing_cost"] + self.to_log["weights/avg_weight_std"] = infos["weights_std"] + for w in infos.index[infos.index.str.contains("weight_")]: + self.to_log[f"weights/{w}"] = infos[w] + + @staticmethod + def discount_rewards(rewards: List[float], gamma: float = 0.99) -> List[float]: + """ + Calculate the discounted rewards of all rewards in list. 
This is used as + Q-values for training the critic network so it becomes better approximating + the real reward we can expect from a given state. + + Args: + rewards: list of rewards/advantages + gamma: Gamma for discounting the long-term rewards. + + Returns: + list of discounted rewards/advantages + """ + assert isinstance(rewards[0], float) + cum_r = [] + sum_r = 0.0 + for r in reversed(rewards): + sum_r = (sum_r * gamma) + r + cum_r.append(sum_r) + return list(reversed(cum_r)) + + @staticmethod + def calc_advantage( + rewards: List[float], + values: List[float], + gamma: float = 0.99, + lam: float = 0.95, + ) -> List[float]: + """ + Calculate the advantage given rewards and state values for an episode. + The advantage compares how much better the actor did compared to what the + critic thought the given state is worth in reward (value). + + Args: + rewards: list of episode rewards + values: list of state values from critic + gamma: Gamma for discounting the long-term rewards. + lam: Lambda for the GAE advantage calculation. + + Returns: + List of advantages. 
buffer. + + Args: + batch: batch of replay buffer/trajectory data + batch_idx: not used + optimizer_idx: idx that controls optimizing actor or critic network + Returns: + loss + """ + state, past_pw, action, old_logp, qval, adv = batch + + # normalize advantages within batch + if self.hparams.normalize_advantages: + adv = (adv - adv.mean()) / adv.std() + + # log all metrics (other than loss) + for k, v in self.to_log.items(): + self.log(k, v, on_step=False, on_epoch=True) + if optimizer_idx == 0: + loss_actor, approx_kl = self.actor_loss( + state, past_pw, action, old_logp, adv + ) + self.log("loss/actor", loss_actor, on_epoch=True, on_step=False) + self.log("loss/approx_kl", approx_kl, on_epoch=True, on_step=False) + return loss_actor + + elif optimizer_idx == 1: + loss_critic = self.critic_loss(state, past_pw, qval) + self.log("loss/critic", loss_critic, on_epoch=True, on_step=False) + return loss_critic + + @staticmethod + def _pre_sanity_check(hparams: Namespace): + if hparams.target_col != NRL.rl_return: + raise ValueError("target_col has to be rl_return for RL tasks.") + if hparams.num_env_workers > 1 and not hparams.no_weight_norm: + hparams.no_weight_norm = True + logger.warning("We set no_weight_norm=True as you have num_env_workers>1.") + + # fill in the same cols for any df that doesn't have the cols_to_model defined + if len(hparams.cols_to_model) > 1: + for df_name, cols in hparams.cols_to_model.items(): + if df_name != npa.anchor and (cols is None or len(cols) == 0): + hparams.cols_to_model[df_name] = deepcopy( + hparams.cols_to_model[npa.anchor] + ) + return hparams + + +# -------------------------------------------------------------------------------------- +# PARALLEL EXPERIENCE COLLECTION +# +# Moving this to another module would result in circular dependencies. Been there, +# done that, it was painful, so let's just leave these here. 
+# --------------------------------------------------------------------------------------
+
+
+class ExperienceBuffer:
+    """
+    Object holding all states, rewards, actions, logp vals, etc of a rollout session,
+    i.e. the phase of the training when we're collecting experience to train on later
+    using the current policy of the actor.
+
+    This is designed to work both with a single worker (single process) or with
+    multiple workers collecting experience in parallel.
+    """
+
+    def __init__(self):
+        """Class constructor"""
+        self.clear_buffer()
+
+    def append(
+        self,
+        state: torch.Tensor,
+        past_pw: torch.Tensor,
+        action: torch.Tensor,
+        logp: torch.Tensor,
+        reward: float,
+        value: torch.Tensor,
+        info: dict,
+    ):
+        """
+        Appends the state (including portfolio value and weights), actions, logp,
+        reward to the buffer after a single step taken in the environment.
+
+        Args:
+            state: State that went into the agent (i.e. both actor and critic).
+            past_pw: Past portfolio value and weights that went into the agent.
+            action: Agent's action to the state and past_pw.
+            logp: Log-probability of the action sampled from the actor's distribution.
+            reward: Reward obtained by the action.
+            value: Estimated (by critic) reward we should have got with this action.
+            info: Portfolio related information returned by the env after the step.
+        """
+        # drop first batch dim so dataloader later can resample them for backprop
+        self.states.append([s.squeeze(0) for s in state])
+        self.past_pws.append(past_pw.squeeze(0))
+        self.actions.append(action)
+        self.logps.append(logp)
+        self.infos.append(info)
+        self.ep_rewards.append(reward)
+        self.ep_values.append(value)
+        self.ep_market_returns.append(info["market_return"])
+
+    def shift_rewards(self):
+        """
+        The reward at time t is realised as a consequence of action t-1. This is
+        special to our environment (see last paragraph of page 9 in the article:
+        https://arxiv.org/pdf/1706.10059.pdf). 
This means, at the end of each episode
+        we need to drop the very last element of state/action/logp/value/info and
+        shift the rewards by one to the right, i.e. making r0 align with a1, r1 with a2,
+        ... rn-1 with an.
+        """
+        self.ep_rewards = self.ep_rewards[1:]
+        self.states.pop(-1)
+        self.past_pws.pop(-1)
+        self.actions.pop(-1)
+        self.logps.pop(-1)
+        self.infos.pop(-1)
+        self.ep_values.pop(-1)
+        self.ep_market_returns.pop(-1)
+
+    def merge_buffers(self, buffers):
+        """
+        Merges the passed in ExperienceBuffers and overwrites the current state with it.
+        Used when experience is gathered by multiple workers in parallel.
+
+        Args:
+            buffers: List of smaller ExperienceBuffers to merge together from parallel
+                processes.
+        """
+        self.states = chain(*[buffer.states for buffer in buffers])
+        self.past_pws = chain(*[buffer.past_pws for buffer in buffers])
+        self.actions = chain(*[buffer.actions for buffer in buffers])
+        self.logps = chain(*[buffer.logps for buffer in buffers])
+        self.qvals = chain(*[buffer.qvals for buffer in buffers])
+        self.advs = chain(*[buffer.advs for buffer in buffers])
+        self.infos = chain(*[buffer.infos for buffer in buffers])
+        self.p_ep_end_value = chain(*[buffer.p_ep_end_value for buffer in buffers])
+        self.p_ep_end_market_return = chain(
+            *[buffer.p_ep_end_market_return for buffer in buffers]
+        )
+        self.done_episodes = sum([buffer.done_episodes for buffer in buffers])
+        self.epoch_rewards = sum([buffer.epoch_rewards for buffer in buffers])
+
+    def clear_buffer(self):
+        """Resets the ExperienceBuffer."""
+        # step vars
+        self.states = []
+        self.past_pws = []
+        self.actions = []
+        self.advs = []
+        self.qvals = []
+        self.logps = []
+        self.infos = []
+        self.p_ep_end_value = []
+        self.p_ep_end_market_return = []
+
+        # episode / epoch vars
+        self.ep_rewards = []
+        self.ep_values = []
+        self.ep_market_returns = []
+        self.done_episodes = 0
+        self.epoch_rewards = 0
+
+
+class ParallelExperiences:
+    """
+    Parallelised experience gathering, idea from 
https://stackoverflow.com/a/45829852
+    Used to spawn parallel processes for each `env_worker` which can independently
+    interact with a copy of the environment and return its rewards, logps, values, etc
+    from the rollout.
+    """
+
+    def __init__(self):
+        """Class constructor."""
+        self.queue = mp.JoinableQueue()
+        self.processes = []
+
+    def collect_experiences(self) -> List[ExperienceBuffer]:
+        """Returns the experiences from parallel workers. You need to wait for these."""
+        buffers = []
+        # gather results from workers using the queue and merge them into one
+        for process in self.processes:
+            buffers.append(self.queue.get())  # will block
+            self.queue.task_done()
+        for process in self.processes:
+            process.join()
+        return buffers
+
+    def create_worker(self, *args, **kwargs):
+        """Creates a new worker, with the args passed in for `_gather_experience`."""
+        args_for_wrapper = [gather_experience, self.queue, args, kwargs]
+        process = mp.Process(target=self._wrapper, args=args_for_wrapper)
+        self.processes.append(process)
+        process.start()
+
+    @staticmethod
+    def _wrapper(func, queue, args, kwargs):
+        """This NEEDS to be a static method for multiprocessing to work"""
+        buffer = func(*args, **kwargs)
+        # add collected experience to the queue so it can be returned to master process
+        queue.put(buffer)
+        queue.join()
+
+
+def gather_experience(
+    env: gym.Env,
+    agent: ActorCriticAgent,
+    device: torch.device,
+    max_steps: int,
+    max_episode_length: int,
+    asset_num: int,
+    num_env_heads: int,
+    gamma: float,
+    lam: float,
+    pgportfolio: bool = True,
+):
+    """
+    Workhorse function of the parallel experience gathering. This function can be
+    called as many times as many CPUs are available on the system, to collect the
+    desired number of steps and store them into an `ExperienceBuffer` that is then
+    passed back (via a `multiprocessing.Queue` object) to the main process that
+    spawned the parallel processes. 
Crucially, this also works if we only have a + single worker i.e. the main process of PPO. + + Args: + env: An instance of the environment to act on. + agent: An instance of the PPO's `ActorCriticAgent`. + device: Device where the agent lives (GPU or CPU). + max_steps: Total number of steps (over multiple episodes) a worker can take. + max_episode_length: Maximum length of a trajectory / episode. + asset_num: Number of assets we are modelling (not including USD). + num_env_heads: Number of environment heads we use to interact with the env. + gamma: See docs of :func:`PPO.calc_advantage` + lam: See docs of :func:`PPO.calc_advantage` + pgportfolio: If True, we calculate the q-values and advantages according to + https://arxiv.org/pdf/1706.10059.pdf, else we use the traditional PPO algo. + + Returns: + Experience collected in this parallel worker. + """ + state = env.reset() + buffers = [ExperienceBuffer() for _ in range(num_env_heads)] + past_pw = torch.ones(num_env_heads, asset_num + 2).to(device) + + for step in range(1, max_steps + 1): + # get action, make step, get reward and info from env + pi, action, actor_logits, logp, value = agent(state, past_pw, device) + if num_env_heads == 1: + action = action.unsqueeze(0) + logp = logp.unsqueeze(0) + next_state, reward, done, info = env.step(action.cpu().numpy()) + + # update past portfolio value / weights for next round + p1 = torch.Tensor([i["portfolio_value"] for i in info]).to(device).unsqueeze(0) + past_pw = torch.cat([p1.T, actor_logits], -1) + + # store everything, we need to do this for each environment head separately + for i, buffer in enumerate(buffers): + buffer.append( + [s[i] for s in state], + past_pw[i], + action[i], + logp[i], + reward[i], + value[i].item(), + info[i], + ) + state = next_state + + episode_end = step > 0 and step % max_episode_length == 0 + if done or episode_end or step == max_steps - 1: + for buffer in buffers: + # according to the PGPortfolio paper, reward should be the sum of + # 
immediate rewards (portfolio returns p1/p0) div by length of episode + if pgportfolio: + buffer.shift_rewards() + buffer.qvals += buffer.ep_rewards + buffer.advs += list( + np.array(buffer.ep_rewards) - np.array(buffer.ep_values) + ) + # classic PPO qval and reward estimation + else: + buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma) + buffer.advs += PPO.calc_advantage( + buffer.ep_rewards, buffer.ep_values, gamma, lam + ) + if done or episode_end: + buffer.done_episodes += 1 + buffer.epoch_rewards += sum(buffer.ep_rewards) + buffer.p_ep_end_value.append(buffer.infos[-1]["portfolio_value"]) + buffer.p_ep_end_market_return.append( + np.array(buffer.ep_market_returns).prod() + ) + + # episode over, reset the env and the episode buffer + buffer.ep_rewards = [] + buffer.ep_values = [] + buffer.ep_market_returns = [] + state = env.reset() + past_pw = torch.ones(num_env_heads, asset_num + 2).to(device) + + # merge buffers from each environment head and return new buffer + merged_buffers = ExperienceBuffer() + merged_buffers.merge_buffers(buffers) + return merged_buffers diff --git a/src/dagobert/modelling/rl/rl_args.py b/src/dagobert/modelling/rl/rl_args.py new file mode 100644 index 00000000..6fd9a604 --- /dev/null +++ b/src/dagobert/modelling/rl/rl_args.py @@ -0,0 +1,244 @@ +""" +All custom arguments and hyper-parameters for the reinforcement learning module. 
+""" + +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + +from pytorch_lightning import Trainer + +from dagobert.modelling.dl.tcn import TCNLightning +from dagobert.modelling.dl.tcn_args import ( + add_run_specific_args, + add_data_specific_args, + add_preprocessing_specific_args, +) +from dagobert.naming import ( + NInputDataCols, + NAugmentationMethods, + NBarriers, + NPreprocessingArgs, +) + + +def add_rl_specific_args(parent_parser): + parser = ArgumentParser( + parents=[parent_parser], + add_help=False, + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # this is just a place-holder so it's easier to read the million params in the cmd + parser.add_argument("--RL_PARAMS", help="====================================") + parser.add_argument( + "--asset_names", + type=str, + nargs="+", + default=["BTC", "ETH"], + help=( + "Names of instruments to include in the portfolio, corresponding to " + "anchor, df2, df3, etc." + ), + ) + parser.add_argument( + "--trading_cost", + type=float, + default=0.002, + help="Commission rate of making trades + an estimated cost of slippage.", + ) + parser.add_argument( + "--reward_type", + type=str, + default="return", + help=( + "Determines the overall reward to maximise by the agent. Either return or " + "sharpe. See RLPortfolio class for more details." + ), + ) + parser.add_argument( + "--num_env_heads", + type=int, + default=1, + help=( + "Number of heads we want to read the environment with concurrently. This " + "is an easy and cheap way to parallelize experience gathering on its own " + "does not require multiple processes to be spawn." + ), + ) + parser.add_argument( + "--num_env_workers", + type=int, + default=1, + help=( + "Number parallel processes to spawn to gather experience. This represents " + "second layer of concurrency (num_env_heads being the first and simplest). " + "If this is set to higher than 1, we will have to turn no_weight_norm=True." 
+ ), + ) + + parser.add_argument( + "--pgportfolio", + action="store_true", + help=( + "If True, we calculate the q-values and advantages according to " + "https://arxiv.org/pdf/1706.10059.pdf, else we use traditional PPO algo." + ), + ) + + parser.add_argument( + "--normalize_advantages", + action="store_true", + help="If used, we normalize the advantages in each batch of the learning phase.", + ) + parser.add_argument( + "--max_episode_length", + type=int, + default=1000, + help=( + "Maximum number of interactions between the agent and the environment in " + "an episode." + ), + ) + parser.add_argument( + "--steps_per_epoch", + type=int, + default=10000, + help=( + "How many action-state pairs to rollout for trajectory collection per " + "epoch. I.e. if all episodes run to their max_episode_length, we'll have " + "steps_per_epoch/max_episode_length number of unique episodes/trajectories." + ), + ) + parser.add_argument( + "--n_optim_iters", + type=int, + default=4, + help=( + "How many steps of gradient descent to perform on each batch. This might " + "seem weird, but it helps sampling efficiency, done by the original PPO " + "implementation and the Google ablation study found it to be useful." + ), + ) + parser.add_argument( + "--gamma", type=float, default=0.99, help="Discounting of rewards." 
+ ) + parser.add_argument( + "--lam", + type=float, + default=0.95, + help="Lambda parameter in the advantage discounting equation.", + ) + parser.add_argument( + "--lr_actor", + type=float, + default=0.0003, + help="Learning rate for the actor/policy network.", + ) + parser.add_argument( + "--lr_critic", + type=float, + default=0.001, + help="Learning rate for the critic/value network.", + ) + parser.add_argument( + "--clip_ratio", + type=float, + default=0.2, + help="Clipping parameter for the PPO's policy upgrade cost function.", + ) + parser.add_argument( + "--actor_dirichlet_sample_size", + type=float, + default=1, + help=( + "Determines how deterministic our Dirichlet based sampling is. At default " + "(1), we return a single sample from the dist. With higher sample sizes " + "the returned weights are closer and closer to the actual mean of the " + "distribution. If it's set to zero, we return the mean and basically " + "the model becomes deterministic." + ), + ) + return parser + + +def add_model_specific_args(parent_parser): + parser = ArgumentParser( + parents=[parent_parser], + add_help=False, + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # this is just a place-holder so it's easier to read the million params in the cmd + parser.add_argument("--MODEL_PARAMS", help="====================================") + parser.add_argument( + "--actor_num_channels", + type=int, + nargs="+", + default=[50, 50, 50, 50, 50], + help=( + "Determines the number of layers (depth) of the actor / policy network and " + "the hidden unit count in each layer." + ), + ) + parser.add_argument( + "--critic_num_channels", + type=int, + nargs="+", + default=[50, 50, 50, 50, 50], + help=( + "Determines the number of layers (depth) of the critic / value network and " + "the hidden unit count in each layer." 
+ ), + ) + parser.add_argument("--actor_kernel_size", type=int, default=5, help=" ") + parser.add_argument("--critic_kernel_size", type=int, default=5, help=" ") + parser.add_argument("--actor_dropout", type=float, default=0, help=" ") + parser.add_argument("--critic_dropout", type=float, default=0, help=" ") + parser.add_argument( + "--no_class_weights", + action="store_true", + help=( + "Set this to True so we can leverage the Preprocessing pipeline written " + "for the supervised DL module." + ), + ) + parser.add_argument( + "--no_weight_norm", + action="store_true", + help=( + " Weight norm is registered as a pre_forward_hook on the 1D convolutional " + "layers of the TemporalBlock, and these cannot be serialised when training " + "with parallel processes interacting with the model concurrently. If True, " + "we add weight normalisation around these layers, and TCN cannot be used " + "in a multiprocessing setting. If False, then it can be used, even staying " + "on GPU in linux (CPU only on Windows)." + ), + ) + parser.add_argument( + "--use_last_timepoint", + action="store_true", + help=( + "If this flag is used the only the network's representation " + "corresponding at the latest time-point is used to predict the outcome." + "By default, we combine all representations across the sequence length" + "to make a prediction from, instead of just using the last one." 
+ ), + ) + return parser + + +def get_all_args(): + parser = ArgumentParser( + description="Lightning RL module", + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # add model params of lightning trainer (this HAS to be first) + parser = Trainer.add_argparse_args(parser) + + # add model and run specific params + parser = add_rl_specific_args(parser) + parser = add_model_specific_args(parser) + parser = add_run_specific_args(parser) + parser = add_data_specific_args(parser) + parser = add_preprocessing_specific_args(parser) + return parser.parse_args() diff --git a/src/dagobert/modelling/rl/rl_runner.py b/src/dagobert/modelling/rl/rl_runner.py new file mode 100644 index 00000000..5b3b9d1c --- /dev/null +++ b/src/dagobert/modelling/rl/rl_runner.py @@ -0,0 +1,38 @@ +""" +Dagobert's runner for reinforcement learning. + +This module is driven by the `dagobert-rl` command which can be parametrised by +command line arguments, but it's much more convenient to use YAML configs for this, +see the `tcn_args.py` and `rl_args.py` for more detail. +""" +import logging +from pathlib import Path + +from dagobert.utils import setup_logging +from dagobert.runner_utils import load_config, update_args +from dagobert.modelling.rl.rl_args import get_all_args +from dagobert.modelling.rl.ppo import run_rl + + +logger = logging.getLogger(__name__) + + +def run(): + """ + Initialise a reinforcement-learning environment and a PPO agent and train it. 
+ """ + + # parse arguments amd setup logging + args = get_all_args() + setup_logging(logger, "dagobert-rl", logging.INFO, args.log_dir) + + # load config yaml if exist + if args.config_path != "": + config = load_config(Path(args.config_path)) + args = update_args(args, config) + + run_rl(args) + + +if __name__ == "__main__": + run() diff --git a/src/dagobert/modelling/rl/utils.py b/src/dagobert/modelling/rl/utils.py new file mode 100644 index 00000000..fad0f6c7 --- /dev/null +++ b/src/dagobert/modelling/rl/utils.py @@ -0,0 +1,26 @@ +""" +Util functions for portfolio optimization and other RL related tasks, including the +classes for gathering experience in parallel. +""" +# pylint: disable=no-member + +import numpy as np + + +eps = np.finfo(float).eps + + +def sharpe_ratio(returns, freq: int = 30, rfr: int = 0): + """ + Given a set of returns, calculates naive (rfr=0) sharpe (eq 28). + """ + return (np.sqrt(freq) * np.mean(returns - rfr + eps)) / np.std(returns - rfr + eps) + + +def max_drawdown(returns): + """ + Max drawdown. 
See https://www.investopedia.com/terms/m/maximum-drawdown-mdd.asp + """ + peak = returns.max() + trough = returns[returns.argmax() :].min() + return (trough - peak) / (peak + eps) diff --git a/src/dagobert/modelling/utils.py b/src/dagobert/modelling/utils.py index ac34c93c..c84612bb 100644 --- a/src/dagobert/modelling/utils.py +++ b/src/dagobert/modelling/utils.py @@ -409,7 +409,7 @@ def update_lookback( for _ in range(num_samples): s = np.random.randint(anchor_len) diffs.append(idx[s + mini_series_length] - idx[s]) - lookback = (pd.Series(diffs).dt.seconds / 3600).quantile(quantile) + lookback = (pd.Series(diffs).dt.total_seconds() / 3600).quantile(quantile) return lookback @@ -421,3 +421,59 @@ def plot_anchor_sample(i, obj, x): cols = obj.hparams.cols_to_model["anchor"] df = pd.DataFrame(x[0][i].detach().cpu().numpy().T, columns=cols) df.plot(subplots=True, layout=(int(np.ceil((len(cols) / 4))), 4)) + + +def plot_pca(pca_x, pca_x_hat) -> Figure: + """ + Plot PCA-reduced x and x_hat to visualise similarity. Overlap suggests similarity. + Args: + pca_x: 2-component-PCA of x + pca_x_hat: 2-component-PCA of x_hat + + Returns: + Scatter plot showing 2-component-PCA of x & x_hat. + """ + f, ax = plt.subplots(1) + + plt.scatter(pca_x[:, 0], pca_x[:, 1], c="black", alpha=0.2, label="Real") + plt.scatter( + pca_x_hat[:, 0], + pca_x_hat[:, 1], + c="red", + alpha=0.2, + label="Synthetic", + ) + ax.legend() + plt.title("PCA plot") + plt.xlabel("x_pca") + plt.ylabel("y_pca") + plt.close() + return f + + +def plot_tsne(tsne_x, tsne_x_hat) -> Figure: + """ + Plot TSNE-reduced x and x_hat to visualise similarity. Overlap suggests similarity. + Args: + tsne_x: 2-component-PCA of x + tsne_x_hat: 2-component-PCA of x_hat + + Returns: + Scatter plot showing 2-component-TSNE of x & x_hat. 
+ """ + f, ax = plt.subplots(1) + + plt.scatter(tsne_x[:, 0], tsne_x[:, 1], c="black", alpha=0.2, label="Real") + plt.scatter( + tsne_x_hat[:, 0], + tsne_x_hat[:, 1], + c="red", + alpha=0.2, + label="Synthetic", + ) + ax.legend() + plt.title("TSNE plot") + plt.xlabel("x_tsne") + plt.ylabel("y_tsne") + plt.close() + return f diff --git a/src/dagobert/naming.py b/src/dagobert/naming.py index f034ce41..f5af7438 100644 --- a/src/dagobert/naming.py +++ b/src/dagobert/naming.py @@ -199,6 +199,7 @@ class NPreprocessingArgs(object): """ anchor = "anchor" + target_col = "target_col" cols_to_model = "cols_to_model" close_original = "close_original" @@ -275,3 +276,25 @@ class NStockstats(object): wr_40 = "wr_40" vr_120 = "vr_120" vr_40 = "vr_40" + + +class NRL(object): + """ + Naming object for reinforcement learning environment / agent / algos. + """ + + rl_return = "rl_return" + + +class NGAN(object): + """ + Naming object for TimeGAN. + """ + + gru = "gru" + lstm = "lstm" + embedder = "embedder" + supervisor = "supervisor" + generator = "generator" + recovery = "recovery" + discriminator = "discriminator" diff --git a/src/dagobert/preprocessing/feature_creation/time_features.py b/src/dagobert/preprocessing/feature_creation/time_features.py index 3cda9c46..479e82b1 100644 --- a/src/dagobert/preprocessing/feature_creation/time_features.py +++ b/src/dagobert/preprocessing/feature_creation/time_features.py @@ -105,11 +105,11 @@ def add_time_features(self) -> Optional: if self.add_time_to_label: btt_col = pd.to_datetime(self.df_bars[self.barrier_touch_time_col]) - time_to_label = (btt_col - date_col).dt.seconds.fillna(0) + time_to_label = (btt_col - date_col).dt.total_seconds().fillna(0) self.df_bars.insert(0, NTimeFeatures.time_to_label, time_to_label) if self.add_date_diff: - date_diff = date_col.diff().dt.seconds.fillna(0) + date_diff = date_col.diff().dt.total_seconds().fillna(0) self.df_bars.insert(0, NTimeFeatures.date_diff, date_diff) logger.info("Added time features.")