From 2076f5123750934c501c2685c6a5f6845fc178a3 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Wed, 23 Dec 2020 12:19:23 +0000 Subject: [PATCH 01/62] started to implement rl env and extended cryptodataset for it --- config/custom/tcn_config_local.yaml | 2 +- config/rl_config.yaml | 178 ++++++++ notebooks/modelling/rl_env.ipynb | 273 ++++++++++++ src/dagobert/modelling/dl/__init__.py | 2 +- src/dagobert/modelling/dl/data.py | 44 +- src/dagobert/modelling/dl/preprocessing.py | 6 +- src/dagobert/modelling/rl/__init__.py | 0 src/dagobert/modelling/rl/env01.py | 442 +++++++++++++++++++ src/dagobert/modelling/rl/env02.py | 473 +++++++++++++++++++++ src/dagobert/modelling/rl/environment.py | 117 +++++ 10 files changed, 1528 insertions(+), 9 deletions(-) create mode 100644 config/rl_config.yaml create mode 100644 notebooks/modelling/rl_env.ipynb create mode 100644 src/dagobert/modelling/rl/__init__.py create mode 100644 src/dagobert/modelling/rl/env01.py create mode 100644 src/dagobert/modelling/rl/env02.py create mode 100644 src/dagobert/modelling/rl/environment.py diff --git a/config/custom/tcn_config_local.yaml b/config/custom/tcn_config_local.yaml index 7ede2d23..5da5c8f9 100644 --- a/config/custom/tcn_config_local.yaml +++ b/config/custom/tcn_config_local.yaml @@ -39,7 +39,7 @@ optuna_submission_delay: 30 output_size: 3 num_channels: [150, 150, 150, 150, 150, 150, 150] -kernel_size: 3 +kernel_size: 10 dropout: 0.5 use_last_timepoint: True last_y: False diff --git a/config/rl_config.yaml b/config/rl_config.yaml new file mode 100644 index 00000000..6508176d --- /dev/null +++ b/config/rl_config.yaml @@ -0,0 +1,178 @@ + + +# -------------------------------------------------------------------------------------- +# LIGHTNING +# -------------------------------------------------------------------------------------- + +gpus: 1 +pin_memory: True +profiler: True +val_check_interval: 0.5 +# enable it with 'power' or 'binsearch' +auto_scale_batch_size: +#precision: 16 + +# 
-------------------------------------------------------------------------------------- +# RUN +# -------------------------------------------------------------------------------------- + +log_dir: logs +num_workers: 4 +exp_name: TCN +tags: + - model1 + - ethusdt_volume500 +no_comet_logger: True +seed: 42 +batch_size: 100 + +# -------------------------------------------------------------------------------------- +# MODEL +# -------------------------------------------------------------------------------------- + +output_size: 2 +num_channels: [150, 150, 150, 150, 150, 150, 150] +kernel_size: 10 +dropout: 0.5 +use_last_timepoint: True +last_y: False +non_last_y_frac: 0.5 +regression: False +density_num: 3 +mix_density_net: False +no_class_weights: False +no_sample_weights: False + +# -------------------------------------------------------------------------------------- +# DATA +# -------------------------------------------------------------------------------------- + +data_dir: "C:/Work/dagobert/data/modelling" + +lookback: auto +mini_series_length: auto + +# If this is set to a number, then simple lookahead labelling is in place +simple_lookahead_y: +simple_lookahead_reg: False + +# If this is True, anchor is labelled before preprocessing. to_label and simple_lookahead_y cannot be used together. 
+to_label: False + +df_train: + anchor: std_bar_BTCUSDT_tick_1.feather + df2: std_bar_ETHUSDT_tick_1.feather +df_val: +df_test: +cols_to_model: + anchor: + - date_diff + - open + - high + - low + - close + - open_fd_0.0 + - high_fd_0.0 + - low_fd_0.0 + - close_fd_0.0 + - open_fd_tuned + - high_fd_tuned + - low_fd_tuned + - close_fd_tuned + - cum_ticks + - cum_dollar + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell + - sin_date + - cos_date + - sin_time + - cos_time + - boll + - boll_lb + - boll_ub + - macd + - macds + - macdh + - wr_60 + - rsi_60 + - rsv_60 + - atr_60 + - cci_60 + - kdjk_60 + - kdjd_60 + - kdjj_60 + - pdi_60 + - mdi_60 + - vr_60 + df2: + - date_diff + - open + - high + - low + - close + - open_fd_0.0 + - high_fd_0.0 + - low_fd_0.0 + - close_fd_0.0 + - open_fd_tuned + - high_fd_tuned + - low_fd_tuned + - close_fd_tuned + - cum_ticks + - cum_dollar + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell + - sin_date + - cos_date + - sin_time + - cos_time + - boll + - boll_lb + - boll_ub + - macd + - macds + - macdh + - wr_60 + - rsi_60 + - rsv_60 + - atr_60 + - cci_60 + - kdjk_60 + - kdjd_60 + - kdjj_60 + - pdi_60 + - mdi_60 + - vr_60 +target_col: close_fd_0.0 +time_feat_n: 1 +time_embed_dim: 12 + +augment_method: random_fast +augment_prob: 0 +augment_dfs: +augment_dfs_mix: 0 + +# -------------------------------------------------------------------------------------- +# PREPROCESSING +# -------------------------------------------------------------------------------------- + +train_start_date: "2018-06-01" +train_days: 730 +val_days: 60 +val_train_offset_days: 1 +val_puffer_days: 1 +test_days: 30 +test_train_offset_days: 62 +test_puffer_days: 1 + +sample_weights: +binariser_method: +scaling_method: minmax diff --git a/notebooks/modelling/rl_env.ipynb b/notebooks/modelling/rl_env.ipynb new file mode 100644 index 
00000000..cd4bd631 --- /dev/null +++ b/notebooks/modelling/rl_env.ipynb @@ -0,0 +1,273 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from dagobert.io import S3Connector\n", + "from dagobert.modelling.rl.environment import PortfolioEnv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PortfolioSim\n", + "\n", + "We go through the `step` function of the `PortfolioSim` class to understand what is it doing. " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.48192771, 0.26506024, 0.25301205])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eps = np.finfo(float).eps\n", + "\n", + "# orig portfolio value\n", + "p0 = 1\n", + "\n", + "# orig portfolio allocation (50% cash, nothing in 25% btc, 25% eth)\n", + "w0 = np.array([.5, .25, .25])\n", + "\n", + "# new relative price vector, expressed as returns (BTC went up 10%, ETH 5%)\n", + "y1 = np.array([1, 1.1, 1.05])\n", + "\n", + "# (eq7) since we last acted prices changed, so weights evolve (see below)\n", + "dw1 = (y1 * w0) / (np.dot(y1, w0) + eps)\n", + "dw1" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.00020481927710843396" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# new weight vector from the agent for this timestep (predicted correctly that BTC/ETH will go up and allocated more USD to 
them)\n", + "w1 = np.array([.4, .3, .3])\n", + "\n", + "# (eq16) cost to change portfolio:\n", + "# excluding change in cash to avoid double counting for transaction cost\n", + "mu = 0.0025 * (np.abs(dw1[1:] - w1[1:])).sum()\n", + "mu" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.045" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.dot(y1, w1)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9997951807228915" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(1 - mu)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0447859638554216" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# (eq11) new portfolio value: see section between (eq19-20) why this works\n", + "p1 = p0 * (1 - mu) * np.dot(y1, w1)\n", + "p1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + " # (eq16) cost to change portfolio:\n", + " # excluding change in cash to avoid double counting for transaction cost\n", + " mu = self.cost * (np.abs(dw1[1:] - w1[1:])).sum()\n", + "\n", + " # (eq11) final portfolio value: see section between (eq19-20) why this works\n", + " p1 = p0 * (1 - mu) * np.dot(y1, w0)\n", + "\n", + " # (eq9 & 10) rate of return log rate of return\n", + " rho1 = p1 / p0 - 1 # rate of returns\n", + " r1 = np.log(p1 + eps) - np.log(p0 + eps)\n", + "\n", + " # (eq22) immediate reward is log rate of return scaled by episode length\n", + " reward = r1 / self.steps" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + 
"data_dir = Path('C:/Work/dagobert/data/modelling')\n", + "instruments = ['BTC', 'ETH', 'XRP', 'LTC']\n", + "datetimes = None\n", + "\n", + "# work out the common datetimes\n", + "for instrument in instruments:\n", + " df = pd.read_feather(data_dir/f'std_bar_{instrument}USDT_tick_1.feather')\n", + " df = df.set_index('date_time')\n", + " if datetimes is not None:\n", + " datetimes = df.index.intersection(datetimes)\n", + " else:\n", + " datetimes = df.index\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# merge instruments \n", + "cols_to_select = ['open', 'low', 'high', 'close', 'volume']\n", + "history = np.empty((len(instruments), len(datetimes), len(cols_to_select)))\n", + "for i,instrument in enumerate(instruments):\n", + " df = pd.read_feather(data_dir/f'std_bar_{instrument}USDT_tick_1.feather')\n", + " history[i, :, :] = df.set_index('date_time').loc[datetimes, cols_to_select].values" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# make portfolio\n", + "portfolio = PortfolioEnv(history, instruments, len(datetimes), window_length=1440)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'PortfolioSim' object has no attribute 'p0'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[1;36m.25\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mportfolio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32mc:\\users\\danih\\dropbox\\dagobert\\dagobert\\src\\dagobert\\modelling\\rl\\environment.py\u001b[0m in \u001b[0;36mstep\u001b[1;34m(self, action)\u001b[0m\n\u001b[0;32m 253\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 254\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 255\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_step\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 256\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 257\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_step\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\danih\\dropbox\\dagobert\\dagobert\\src\\dagobert\\modelling\\rl\\environment.py\u001b[0m in \u001b[0;36m_step\u001b[1;34m(self, action)\u001b[0m\n\u001b[0;32m 296\u001b[0m \u001b[0mopen_price_vector\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mobservation\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 297\u001b[0m \u001b[0my1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mclose_price_vector\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mopen_price_vector\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 298\u001b[1;33m \u001b[0mreward\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minfo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdone2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msim\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_step\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mweights\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 299\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 300\u001b[0m \u001b[1;31m# calculate return for buy and hold a bit of each asset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\danih\\dropbox\\dagobert\\dagobert\\src\\dagobert\\modelling\\rl\\environment.py\u001b[0m in \u001b[0;36m_step\u001b[1;34m(self, w1, y1)\u001b[0m\n\u001b[0;32m 146\u001b[0m \u001b[1;32massert\u001b[0m \u001b[0my1\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1.0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"y1[0] must be 1\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 147\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 148\u001b[1;33m \u001b[0mp0\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mp0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 149\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 150\u001b[0m \u001b[0mdw1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0my1\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mw1\u001b[0m\u001b[1;33m)\u001b[0m 
\u001b[1;33m/\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mw1\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0meps\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# (eq7) weights evolve into\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAttributeError\u001b[0m: 'PortfolioSim' object has no attribute 'p0'" + ] + } + ], + "source": [ + "w = np.array([0, .25, .25, .25, .25])\n", + "portfolio.step(w)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7.6 64-bit ('dagobert': conda)", + "language": "python", + "name": "python37664bitdagobertconda90fcdb25face404d8cd237e8f8473045" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/dagobert/modelling/dl/__init__.py b/src/dagobert/modelling/dl/__init__.py index b69b500d..81ddc443 100644 --- a/src/dagobert/modelling/dl/__init__.py +++ b/src/dagobert/modelling/dl/__init__.py @@ -1,4 +1,4 @@ -from .data import CryptoDataset +from .data import CryptoDataset, PortfolioCryptoDataset from .tcn_net import TemporalConvNet from .utils import LogCoshLoss, FocalLoss, MixedNormalPDFLoss from .adabelief import AdaBelief diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index e1a29584..09825bda 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -1,5 +1,6 @@ """ -Classes defining PyTorch datasets for modelling. +Classes defining PyTorch datasets for supervised deep learning and multi-instrument +reinforcement learning. 
""" import logging from pathlib import Path @@ -68,12 +69,12 @@ class CryptoDataset(Dataset): - std_bar_BTCUSDT_volume_250.feather - std_bar_BTCUSDT_volume_500.feather - Here, we have the lovest granularity ETHUSDT and BTCUSDT bars simultaneously used + Here, we have the lowest granularity ETHUSDT and BTCUSDT bars simultaneously used to model whether the ETH price will be up or down in 30 minutes: simple_lookahead_y. Both input DFs use 4 columns (OHLC) and both can be augmented with a 50% chance, meaning, on average in every batch 256 samples would come from the augment_dfs. The anchor can only be augmented by one of the other less granular ETHUSDT volume - bar datasets, while the secondary input DF (BTCUSDT) chas its own two augmentation + bar datasets, while the secondary input DF (BTCUSDT) has its own two augmentation data sources. Internally these 6 DFs will be rapackaged as a single dict (`self.dfs`), where the @@ -94,7 +95,7 @@ class CryptoDataset(Dataset): The reason why we disect the data (pandas DFs) into these dicts and list of np arrays is because of huge performance gains when we do the indexing in np instead - of pandas .loc. + of using the .loc method of pandas. """ def __init__( @@ -559,3 +560,38 @@ def plot(self) -> Tuple[Figure]: plt.ylabel("Count") plt.close() return fig_close, fig_data, fig_target + + +class PortfolioCryptoDataset(CryptoDataset): + """ + This extends :class:`dagobert.modelling.dl.data.CryptoDataset` to make it + suitable for multi instrument portfolio optimization through reinforcement-learning. + + Instead of returning an array of Xs and single y, this returns only the Xs, and + uses the last time step of the Xs as the target. 
+ """ + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + # for each instrument, fish out the index of the target_col (close_0 by default) + self.target_col_ix = [ + np.where(self.target_col == np.array(cols))[0] + for cols in self.cols_to_model.values() + ] + + def __getitem__(self, idx): + """ + We don't need to calculate or fetch y, as we can simply use the last timepoint + for that. + """ + idx = idx.tolist() if torch.is_tensor(idx) else idx + batch_dfs, batch_indices, _ = self._get_batch_dfs_indices_target() + from_idx, upto_idx = self._get_from_upto_idxs(idx, batch_indices) + Xs = self._get_Xs(batch_dfs, from_idx, upto_idx) + + # cut off last time-point from each X as use that as y. + ys = np.empty(len(Xs)) + for i, X in enumerate(Xs): + Xs[i] = X[:, :-1] + ys[i] = X[self.target_col_ix[i], -1] + return Xs, ys diff --git a/src/dagobert/modelling/dl/preprocessing.py b/src/dagobert/modelling/dl/preprocessing.py index 6c4b8d12..50efa475 100644 --- a/src/dagobert/modelling/dl/preprocessing.py +++ b/src/dagobert/modelling/dl/preprocessing.py @@ -41,9 +41,9 @@ class Preprocessing(object): @staticmethod def preprocess_augment_dfs(hparams: Namespace) -> Namespace: """ - If DFs are defined in augment_dfs we download and scale them. We only - check the scaling_method parameter of hparams for deciding if we already have - a downloaeded and processed version in the data_dir. + If DFs are defined in augment_dfs we download and scale them plus label the + anchor. We only check the labelling & scaling_method parameters of hparams for + deciding if we already have a downloaeded and processed version in the data_dir. Args: hparams: Parsed hypere parameters of the experiment. 
class DataSrc(object):
    """Acts as data provider for each new episode."""

    def __init__(
        self,
        df,
        steps=252,
        scale=True,
        scale_extra_cols=True,
        augment=0.00,
        window_length=50,
        random_reset=True,
    ):
        """
        DataSrc.

        Args:
            df: DataFrame with a timestamp index and multi-index columns
                levels=[['LTCBTC', ...], ['open', 'low', 'high', 'close', ...]].
                NOTE(review): feature order is read from the (sorted) column
                level while the raw matrix keeps the physical column order --
                assumes the columns are lexsorted; confirm upstream.
            steps: total steps in an episode.
            scale: scale the price columns of each window by its last close.
            scale_extra_cols: scale extra (non-price) columns by global mean/std.
            augment: std of gaussian noise added to each episode's data.
            window_length: number of past observations returned per step.
            random_reset: reset to a random time (otherwise walk forward
                through time, wrapping around at the end).
        """
        self.steps = steps + 1
        self.augment = augment
        self.random_reset = random_reset
        self.scale = scale
        self.scale_extra_cols = scale_extra_cols
        self.window_length = window_length
        self.idx = self.window_length

        # Forward-fill gaps first, then zero anything still missing at the very
        # start of the series. (The original zero-filled all NaNs first, which
        # made the subsequent pad-fill dead code.)
        df = df.ffill().fillna(0)

        # dataframe to matrix, shaped (assets, time, features)
        self.asset_names = df.columns.levels[0].tolist()
        self.features = df.columns.levels[1].tolist()
        # .as_matrix() was removed in pandas 1.0 -- use .to_numpy()
        data = df.to_numpy().reshape(
            (len(df), len(self.asset_names), len(self.features))
        )
        self._data = np.transpose(data, (1, 0, 2))
        self._times = df.index

        self.price_columns = ["close", "high", "low", "open"]
        self.non_price_columns = set(df.columns.levels[1]) - set(self.price_columns)

        # Stats to let us normalize the non-price columns.
        if scale_extra_cols:
            x = self._data.reshape((-1, len(self.features)))
            self.stats = dict(mean=x.mean(0), std=x.std(0))

        self.reset()

    def _step(self):
        """Return (history, y1, done) for the current step and advance time."""
        # observation window ending at the current step
        data_window = self.data[:, self.step : self.step + self.window_length].copy()

        # (eq 1) price relative vector from the last two closes (feature 0)
        y1 = data_window[:, -1, 0] / data_window[:, -2, 0]
        y1 = np.concatenate([[1.0], y1])  # prepend the cash price

        # (eq 18) X: price columns are divided by the window's last close price
        nb_pc = len(self.price_columns)
        if self.scale:
            last_close_price = data_window[:, -1, 0]
            data_window[:, :, :nb_pc] /= last_close_price[:, np.newaxis, np.newaxis]

        if self.scale_extra_cols:
            # standardise non-price columns, then clip outliers
            # NOTE(review): the clip bounds are in the ORIGINAL units
            # (mean +/- 10*std) although the data was just standardised;
            # preserved as-is, but this looks like it was meant to be +/-10.
            data_window[:, :, nb_pc:] -= self.stats["mean"][None, None, nb_pc:]
            data_window[:, :, nb_pc:] /= self.stats["std"][None, None, nb_pc:]
            data_window[:, :, nb_pc:] = np.clip(
                data_window[:, :, nb_pc:],
                self.stats["mean"][nb_pc:] - self.stats["std"][nb_pc:] * 10,
                self.stats["mean"][nb_pc:] + self.stats["std"][nb_pc:] * 10,
            )

        self.step += 1
        history = data_window
        done = bool(self.step >= self.steps)

        return history, y1, done

    def reset(self):
        """Start a new episode: pick its data slice and optionally add noise."""
        self.step = 0

        if self.random_reset:
            self.idx = np.random.randint(
                low=self.window_length + 1, high=self._data.shape[1] - self.steps - 2
            )
        else:
            # continue sequentially, wrapping back to the start when exhausted
            if self.idx > (self._data.shape[1] - self.steps - self.window_length - 1):
                self.idx = self.window_length + 1
            else:
                self.idx += self.steps
        data = self._data[
            :, self.idx - self.window_length : self.idx + self.steps + 1
        ].copy()
        self.times = self._times[
            self.idx - self.window_length : self.idx + self.steps + 1
        ]

        # augment data to prevent overfitting (skipped when augment == 0,
        # which also avoids burning RNG state on a no-op draw)
        if self.augment:
            data += np.random.normal(loc=0, scale=self.augment, size=data.shape)

        self.data = data
class PortfolioSim(object):
    """
    Portfolio management simulator.

    Tracks the portfolio weight vector and value across steps, charging a
    proportional transaction cost on every reallocation.

    Params:
        asset_names: names of the non-cash assets (cash occupies slot 0).
        steps: episode length, used to scale the per-step reward (eq 22).
        trading_cost: fraction charged on traded volume; e.g. 0.0025 is the
            max at Poloniex.
        time_cost: fraction charged per step for holding (currently unused in
            `_step`; kept for interface compatibility).

    Based on [Jiang 2017](https://arxiv.org/abs/1706.10059)
    """

    def __init__(self, asset_names=(), steps=128, trading_cost=0.0025, time_cost=0.0):
        self.cost = trading_cost
        self.time_cost = time_cost
        self.steps = steps
        # copy into a list so a caller-supplied sequence is never shared
        # (the original used a mutable default argument here)
        self.asset_names = list(asset_names)
        self.reset()

    def _step(self, w1, y1):
        """
        Advance the simulator by one step.

        Args:
            w1: new portfolio weight vector from the agent, e.g. [0.1, 0.9, 0.0]
                (first entry is cash).
            y1: price relative vector, also called return, e.g. [1.0, 0.9, 1.1]
                (first entry is cash, so it is always 1.0).

        Returns:
            (reward, info, done); `info` holds single scalar values only.

        Numbered equations are from https://arxiv.org/abs/1706.10059
        """
        # mirrors the shared config epsilon (machine epsilon for float64)
        eps = np.finfo(float).eps

        w0 = self.w0
        p0 = self.p0

        # (eq7) since we last acted prices changed, so weights evolve into
        dw1 = (y1 * w0) / (np.dot(y1, w0) + eps)

        # (eq16) cost to change portfolio: p' -> mu -> pt, see Figure 1
        # excluding change in cash to avoid double counting for transaction cost
        mu = self.cost * (np.abs(dw1[1:] - w1[1:])).sum()

        # (eq11) final portfolio value grows with the NEW weights w1, not the
        # drifted old ones (the original used w0 here, which ignored the
        # agent's reallocation) -- see section between (eq19-20) why this works
        p1 = p0 * (1 - mu) * np.dot(y1, w1)

        # (eq9 & 10) rate of return and log rate of return
        rho1 = p1 / p0 - 1
        r1 = np.log(p1 + eps) - np.log(p0 + eps)

        # (eq22) immediate reward is log rate of return scaled by episode length
        reward = r1 / self.steps

        # remember for next step
        self.w0 = w1
        self.p0 = p1

        # if we run out of money, we're done
        done = bool(p1 == 0)

        info = {
            "reward": reward,
            "log_return": r1,
            "portfolio_value": p1,
            "market_return": y1.mean(),
            "rate_of_return": rho1,
            "weights_mean": w1.mean(),
            "weights_std": w1.std(),
            "cost": mu,
        }
        # record weights and prices per asset ("BTCBTC" stands in for cash)
        for i, name in enumerate(["BTCBTC"] + self.asset_names):
            info["weight_" + name] = w1[i]
            info["price_" + name] = y1[i]

        self.infos.append(info)
        return reward, info, done

    def reset(self):
        """Start fully in cash with unit portfolio value."""
        self.infos = []
        self.w0 = np.array([1.0] + [0.0] * len(self.asset_names))
        self.p0 = 1.0
+ + Based on [Jiang 2017](https://arxiv.org/abs/1706.10059) + """ + + metadata = {"render.modes": ["notebook", "ansi"]} + + def __init__( + self, + df, + steps=256, + trading_cost=0.0025, + time_cost=0.00, + window_length=50, + augment=0.00, + output_mode="EIIE", + log_dir=None, + scale=True, + scale_extra_cols=True, + random_reset=True, + ): + """ + An environment for financial portfolio management. + + Params: + df - csv for data frame index of timestamps + and multi-index columns levels=[['LTCBTC'],...],['open','low','high','close']] + steps - steps in episode + window_length - how many past observations["history"] to return + trading_cost - cost of trade as a fraction, e.g. 0.0025 corresponding to max rate of 0.25% at Poloniex (2017) + time_cost - cost of holding as a fraction + augment - fraction to randomly shift data by + output_mode: decides observation["history"] shape + - 'EIIE' for (assets, window, 3) + - 'atari' for (window, window, 3) (assets is padded) + - 'mlp' for (assets*window*3) + log_dir: directory to save plots to + scale - scales price data by last opening price on each episode (except return) + scale_extra_cols - scales non price data using mean and std for whole dataset + """ + self.src = DataSrc( + df=df, + steps=steps, + scale=scale, + scale_extra_cols=scale_extra_cols, + augment=augment, + window_length=window_length, + random_reset=random_reset, + ) + self._plot = self._plot2 = self._plot3 = None + self.output_mode = output_mode + self.sim = PortfolioSim( + asset_names=self.src.asset_names, + trading_cost=trading_cost, + time_cost=time_cost, + steps=steps, + ) + self.log_dir = log_dir + + # openai gym attributes + # action will be the portfolio weights [cash_bias,w1,w2...] 
where wn are [0, 1] for each asset + nb_assets = len(self.src.asset_names) + self.action_space = gym.spaces.Box(0.0, 1.0, shape=nb_assets + 1) + + # get the history space from the data min and max + if output_mode == "EIIE": + obs_shape = (nb_assets, window_length, len(self.src.features)) + elif output_mode == "atari": + obs_shape = (window_length, window_length, len(self.src.features)) + elif output_mode == "mlp": + obs_shape = (nb_assets) * window_length * (len(self.src.features)) + else: + raise Exception("Invalid value for output_mode: %s" % self.output_mode) + + self.observation_space = gym.spaces.Dict( + { + "history": gym.spaces.Box( + -10, + 20 + if scale + else 1, # if scale=True observed price changes return could be large fractions + obs_shape, + ), + "weights": self.action_space, + } + ) + self._reset() + + def _step(self, action): + """ + Step the env. + + Actions should be portfolio [w0...] + - Where wn is a portfolio weight between 0 and 1. The first (w0) is cash_bias + - cn is the portfolio conversion weights see PortioSim._step for description + """ + logger.debug("action: %s", action) + + weights = np.clip(action, 0.0, 1.0) + weights /= weights.sum() + eps + + # Sanity checks + assert self.action_space.contains( + action + ), "action should be within %r but is %r" % (self.action_space, action) + np.testing.assert_almost_equal( + np.sum(weights), + 1.0, + 3, + err_msg='weights should sum to 1. 
action="%s"' % weights, + ) + + history, y1, done1 = self.src._step() + + reward, info, done2 = self.sim._step(weights, y1) + + # calculate return for buy and hold a bit of each asset + info["market_value"] = np.cumprod( + [inf["market_return"] for inf in self.infos + [info]] + )[-1] + # add dates + info["date"] = self.src.times[self.src.step].timestamp() + info["steps"] = self.src.step + + self.infos.append(info) + + # reshape history according to output mode + if self.output_mode == "EIIE": + pass + elif self.output_mode == "atari": + padding = history.shape[1] - history.shape[0] + history = np.pad(history, [[0, padding], [0, 0], [0, 0]], mode="constant") + elif self.output_mode == "mlp": + history = history.flatten() + + return {"history": history, "weights": weights}, reward, done1 or done2, info + + def _reset(self): + self.sim.reset() + self.src.reset() + self.infos = [] + action = self.sim.w0 + observation, reward, done, info = self.step(action) + return observation + + def _seed(self, seed): + np.random.seed(seed) + return [seed] + + def _render(self, mode="notebook", close=False): + # if close: + # return + if mode == "ansi": + pprint(self.infos[-1]) + elif mode == "notebook": + self.plot_notebook(close) + + def plot_notebook(self, close=False): + """Live plot using the jupyter notebook rendering of matplotlib.""" + + if close: + self._plot = self._plot2 = self._plot3 = None + return + + df_info = pd.DataFrame(self.infos) + df_info.index = pd.to_datetime(df_info["date"], unit="s") + + # plot prices and performance + all_assets = ["BTCBTC"] + self.sim.asset_names + if not self._plot: + colors = [None] * len(all_assets) + ["black"] + self._plot_dir = ( + os.path.join(self.log_dir, "notebook_plot_prices_" + str(time.time())) + if self.log_dir + else None + ) + self._plot = LivePlotNotebook( + log_dir=self._plot_dir, + title="prices & performance", + labels=all_assets + ["Portfolio"], + ylabel="value", + colors=colors, + ) + x = df_info.index + y_portfolio = 
df_info["portfolio_value"] + y_assets = [df_info["price_" + name].cumprod() for name in all_assets] + self._plot.update(x, y_assets + [y_portfolio]) + + # plot portfolio weights + if not self._plot2: + self._plot_dir2 = ( + os.path.join(self.log_dir, "notebook_plot_weights_" + str(time.time())) + if self.log_dir + else None + ) + self._plot2 = LivePlotNotebook( + log_dir=self._plot_dir2, + labels=all_assets, + title="weights", + ylabel="weight", + ) + ys = [df_info["weight_" + name] for name in all_assets] + self._plot2.update(x, ys) + + # plot portfolio costs + if not self._plot3: + self._plot_dir3 = ( + os.path.join(self.log_dir, "notebook_plot_cost_" + str(time.time())) + if self.log_dir + else None + ) + self._plot3 = LivePlotNotebook( + log_dir=self._plot_dir3, labels=["cost"], title="costs", ylabel="cost" + ) + ys = [df_info["cost"].cumsum()] + self._plot3.update(x, ys) + + if close: + self._plot = self._plot2 = self._plot3 = None diff --git a/src/dagobert/modelling/rl/env02.py b/src/dagobert/modelling/rl/env02.py new file mode 100644 index 00000000..7dfdbd55 --- /dev/null +++ b/src/dagobert/modelling/rl/env02.py @@ -0,0 +1,473 @@ +""" +Modified from https://github.com/vermouth1992/drl-portfolio-management +""" +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import gym +import gym.spaces + +eps = np.finfo(float).eps + + +def random_shift(x, fraction): + """ Apply a random shift to a pandas series. """ + min_x, max_x = np.min(x), np.max(x) + m = np.random.uniform(-fraction, fraction, size=x.shape) + 1 + return np.clip(x * m, min_x, max_x) + + +def scale_to_start(x): + """ Scale pandas series so that it starts at one. """ + x = (x + eps) / (x[0] + eps) + return x + + +def sharpe(returns, freq=30, rfr=0): + """ Given a set of returns, calculates naive (rfr=0) sharpe (eq 28). """ + return (np.sqrt(freq) * np.mean(returns - rfr + eps)) / np.std(returns - rfr + eps) + + +def max_drawdown(returns): + """ Max drawdown. 
See https://www.investopedia.com/terms/m/maximum-drawdown-mdd.asp """ + peak = returns.max() + trough = returns[returns.argmax() :].min() + return (trough - peak) / (peak + eps) + + +class DataGenerator(object): + """Acts as data provider for each new episode.""" + + def __init__( + self, + history, + abbreviation, + steps=730, + window_length=50, + start_idx=0, + start_date=None, + ): + """ + + Args: + history: (num_stocks, timestamp, 5) open, high, low, close, volume + abbreviation: a list of length num_stocks with assets name + steps: the total number of steps to simulate, default is 2 years + window_length: observation window, must be less than 50 + start_date: the date to start. Default is None and random pick one. + It should be a string e.g. '2012-08-13' + """ + assert history.shape[0] == len( + abbreviation + ), "Number of stock is not consistent" + import copy + + self.steps = steps + 1 + self.window_length = window_length + self.step = start_idx + self.start_date = start_date + + # make immutable class + self._data = history.copy() # all data + self.asset_names = copy.copy(abbreviation) + + def _step(self): + # get observation matrix from history, exclude volume, maybe volume is useful as it + # indicates how market total investment changes. Normalize could be critical here + self.step += 1 + obs = self.data[:, self.step : self.step + self.window_length, :].copy() + # normalize obs with open price + + # used for compute optimal action and sanity check + ground_truth_obs = self.data[ + :, self.step + self.window_length : self.step + self.window_length + 1, : + ].copy() + + done = self.step >= self.steps + return obs, done, ground_truth_obs + + def reset(self): + self.step = 0 + + # get data for this episode, each episode might be different. 
+ if self.start_date is None: + self.idx = np.random.randint( + low=self.window_length, high=self._data.shape[1] - self.steps + ) + else: + raise ValueError("start_date is not yet supported / implemented") + # compute index corresponding to start_date for repeatable sequence + # self.idx = date_to_index(self.start_date) - self.start_idx + # assert ( + # self.idx >= self.window_length + # and self.idx <= self._data.shape[1] - self.steps + # ), "Invalid start date, must be window_length day after start date and simulation steps day before end date" + data = self._data[ + :, self.idx - self.window_length : self.idx + self.steps + 1, :4 + ] + # apply augmentation? + self.data = data + return ( + self.data[:, self.step : self.step + self.window_length, :].copy(), + self.data[ + :, + self.step + self.window_length : self.step + self.window_length + 1, + :, + ].copy(), + ) + + +class PortfolioSim(object): + """ + Portfolio management sim. + Params: + - cost e.g. 0.0025 is max in Poliniex + Based of [Jiang 2017](https://arxiv.org/abs/1706.10059) + """ + + def __init__( + self, asset_names=list(), steps=730, trading_cost=0.0025, time_cost=0.0 + ): + self.asset_names = asset_names + self.cost = trading_cost + self.time_cost = time_cost + self.steps = steps + self.reset() + + def _step(self, w1, y1): + """ + Step. + w1 - new action of portfolio weights - e.g. [0.1,0.9,0.0] + y1 - price relative vector also called return + e.g. 
[1.0, 0.9, 1.1] + Numbered equations are from https://arxiv.org/abs/1706.10059 + """ + assert w1.shape == y1.shape, "w1 and y1 must have the same shape" + assert y1[0] == 1.0, "y1[0] must be 1" + + p0 = self.p0 + + dw1 = (y1 * w1) / (np.dot(y1, w1) + eps) # (eq7) weights evolve into + + mu1 = self.cost * (np.abs(dw1 - w1)).sum() # (eq16) cost to change portfolio + + assert mu1 < 1.0, "Cost is larger than current holding" + + p1 = p0 * (1 - mu1) * np.dot(y1, w1) # (eq11) final portfolio value + + p1 = p1 * (1 - self.time_cost) # we can add a cost to holding + + rho1 = p1 / p0 - 1 # rate of returns + r1 = np.log((p1 + eps) / (p0 + eps)) # log rate of return + reward = r1 / self.steps * 1000.0 # (22) average logarithmic accumulated return + # remember for next step + self.p0 = p1 + + # if we run out of money, we're done (losing all the money) + done = p1 == 0 + + info = { + "reward": reward, + "log_return": r1, + "portfolio_value": p1, + "return": y1.mean(), + "rate_of_return": rho1, + "weights_mean": w1.mean(), + "weights_std": w1.std(), + "cost": mu1, + } + self.infos.append(info) + return reward, info, done + + def reset(self): + self.infos = [] + self.p0 = 1.0 + + +class PortfolioEnv(gym.Env): + """ + An environment for financial portfolio management. + Financial portfolio management is the process of constant redistribution of a fund into different + financial products. + Based on [Jiang 2017](https://arxiv.org/abs/1706.10059) + """ + + metadata = {"render.modes": ["human", "ansi"]} + + def __init__( + self, + history, + abbreviation, + steps=730, # 2 years + trading_cost=0.0025, + time_cost=0.00, + window_length=50, + start_idx=0, + sample_start_date=None, + ): + """ + An environment for financial portfolio management. 
+ Params: + steps - steps in episode + scale - scale data and each episode (except return) + augment - fraction to randomly shift data by + trading_cost - cost of trade as a fraction + time_cost - cost of holding as a fraction + window_length - how many past observations to return + start_idx - The number of days from '2012-08-13' of the dataset + sample_start_date - The start date sampling from the history + """ + self.window_length = window_length + self.num_stocks = history.shape[0] + self.start_idx = start_idx + + self.src = DataGenerator( + history, + abbreviation, + steps=steps, + window_length=window_length, + start_idx=start_idx, + start_date=sample_start_date, + ) + + self.sim = PortfolioSim( + asset_names=abbreviation, + trading_cost=trading_cost, + time_cost=time_cost, + steps=steps, + ) + + # openai gym attributes + # action will be the portfolio weights from 0 to 1 for each asset + self.action_space = gym.spaces.Box( + 0, 1, shape=(len(self.src.asset_names) + 1,), dtype=np.float32 + ) # include cash + + # get the observation space from the data min and max + self.observation_space = gym.spaces.Box( + low=-np.inf, + high=np.inf, + shape=(len(abbreviation), window_length, history.shape[-1]), + dtype=np.float32, + ) + + def step(self, action): + return self._step(action) + + def _step(self, action): + """ + Step the env. + Actions should be portfolio [w0...] + - Where wn is a portfolio weight from 0 to 1. The first is cash_bias + - cn is the portfolio conversion weights see PortioSim._step for description + """ + np.testing.assert_almost_equal(action.shape, (len(self.sim.asset_names) + 1,)) + + # normalise just in case + action = np.clip(action, 0, 1) + + weights = action # np.array([cash_bias] + list(action)) # [w0, w1...] + weights /= weights.sum() + eps + weights[0] += np.clip( + 1 - weights.sum(), 0, 1 + ) # so if weights are all zeros we normalise to [1,0...] 
+ + assert ((action >= 0) * (action <= 1)).all(), ( + "all action values should be between 0 and 1. Not %s" % action + ) + np.testing.assert_almost_equal( + np.sum(weights), + 1.0, + 3, + err_msg='weights should sum to 1. action="%s"' % weights, + ) + + observation, done1, ground_truth_obs = self.src._step() + + # concatenate observation with ones + cash_observation = np.ones((1, self.window_length, observation.shape[2])) + observation = np.concatenate((cash_observation, observation), axis=0) + + cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) + ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) + + # relative price vector of last observation day (close/open) + close_price_vector = observation[:, -1, 3] + open_price_vector = observation[:, -1, 0] + y1 = close_price_vector / open_price_vector + reward, info, done2 = self.sim._step(weights, y1) + + # calculate return for buy and hold a bit of each asset + info["market_value"] = np.cumprod( + [inf["return"] for inf in self.infos + [info]] + )[-1] + # add dates + info["date"] = self.start_idx + self.src.idx + self.src.step + info["steps"] = self.src.step + info["next_obs"] = ground_truth_obs + + self.infos.append(info) + + return observation, reward, done1 or done2, info + + def reset(self): + return self._reset() + + def _reset(self): + self.infos = [] + self.sim.reset() + observation, ground_truth_obs = self.src.reset() + cash_observation = np.ones((1, self.window_length, observation.shape[2])) + observation = np.concatenate((cash_observation, observation), axis=0) + cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) + ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) + info = {} + info["next_obs"] = ground_truth_obs + return observation, info + + def _render(self, mode="human", close=False): + if close: + return + if mode == "ansi": + print(self.infos[-1]) + elif mode == "human": + self.plot() + + def render(self, mode="human", 
close=False): + return self._render(mode="human", close=False) + + def plot(self): + # show a plot of portfolio vs mean market performance + df_info = pd.DataFrame(self.infos) + df_info["date"] = pd.to_datetime(df_info["date"], format="%Y-%m-%d") + df_info.set_index("date", inplace=True) + mdd = max_drawdown(df_info.rate_of_return + 1) + sharpe_ratio = sharpe(df_info.rate_of_return) + title = "max_drawdown={: 2.2%} sharpe_ratio={: 2.4f}".format(mdd, sharpe_ratio) + df_info[["portfolio_value", "market_value"]].plot( + title=title, fig=plt.gcf(), rot=30 + ) + + +class MultiActionPortfolioEnv(PortfolioEnv): + def __init__( + self, + history, + abbreviation, + model_names, + steps=730, # 2 years + trading_cost=0.0025, + time_cost=0.00, + window_length=50, + start_idx=0, + sample_start_date=None, + ): + super(MultiActionPortfolioEnv, self).__init__( + history, + abbreviation, + steps, + trading_cost, + time_cost, + window_length, + start_idx, + sample_start_date, + ) + self.model_names = model_names + # need to create each simulator for each model + self.sim = [ + PortfolioSim( + asset_names=abbreviation, + trading_cost=trading_cost, + time_cost=time_cost, + steps=steps, + ) + for _ in range(len(self.model_names)) + ] + + def _step(self, action): + """Step the environment by a vector of actions + + Args: + action: (num_models, num_stocks + 1) + + Returns: + + """ + assert ( + action.ndim == 2 + ), "Action must be a two dimensional array with shape (num_models, num_stocks + 1)" + assert action.shape[1] == len(self.sim[0].asset_names) + 1 + assert action.shape[0] == len(self.model_names) + # normalise just in case + action = np.clip(action, 0, 1) + weights = action # np.array([cash_bias] + list(action)) # [w0, w1...] + weights /= np.sum(weights, axis=1, keepdims=True) + eps + # so if weights are all zeros we normalise to [1,0...] 
+ weights[:, 0] += np.clip(1 - np.sum(weights, axis=1), 0, 1) + assert ((action >= 0) * (action <= 1)).all(), ( + "all action values should be between 0 and 1. Not %s" % action + ) + np.testing.assert_almost_equal( + np.sum(weights, axis=1), + np.ones(shape=(weights.shape[0])), + 3, + err_msg='weights should sum to 1. action="%s"' % weights, + ) + observation, done1, ground_truth_obs = self.src._step() + + # concatenate observation with ones + cash_observation = np.ones((1, self.window_length, observation.shape[2])) + observation = np.concatenate((cash_observation, observation), axis=0) + + cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) + ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) + + # relative price vector of last observation day (close/open) + close_price_vector = observation[:, -1, 3] + open_price_vector = observation[:, -1, 0] + y1 = close_price_vector / open_price_vector + + rewards = np.empty(shape=(weights.shape[0])) + info = {} + dones = np.empty(shape=(weights.shape[0]), dtype=bool) + for i in range(weights.shape[0]): + reward, current_info, done2 = self.sim[i]._step(weights[i], y1) + rewards[i] = reward + info[self.model_names[i]] = current_info["portfolio_value"] + info["return"] = current_info["return"] + dones[i] = done2 + + # calculate return for buy and hold a bit of each asset + info["market_value"] = np.cumprod( + [inf["return"] for inf in self.infos + [info]] + )[-1] + # add dates + info["date"] = self.start_idx + self.src.idx + self.src.step + info["steps"] = self.src.step + info["next_obs"] = ground_truth_obs + + self.infos.append(info) + + return observation, rewards, np.all(dones) or done1, info + + def _reset(self): + self.infos = [] + for sim in self.sim: + sim.reset() + observation, ground_truth_obs = self.src.reset() + cash_observation = np.ones((1, self.window_length, observation.shape[2])) + observation = np.concatenate((cash_observation, observation), axis=0) + cash_ground_truth = 
np.ones((1, 1, ground_truth_obs.shape[2])) + ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) + info = {} + info["next_obs"] = ground_truth_obs + return observation, info + + def plot(self): + df_info = pd.DataFrame(self.infos) + fig = plt.gcf() + title = "Trading Performance of Various Models" + df_info["date"] = pd.to_datetime(df_info["date"], format="%Y-%m-%d") + df_info.set_index("date", inplace=True) + df_info[self.model_names + ["market_value"]].plot(title=title, fig=fig, rot=30) diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py new file mode 100644 index 00000000..ce8861da --- /dev/null +++ b/src/dagobert/modelling/rl/environment.py @@ -0,0 +1,117 @@ +""" +Class defining PyTorch datasets for supervised modelling of a single instrument. +""" +import logging +from typing import List + +import gym +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +from dagobert.modelling.dl import PortfolioCryptoDataset + +logger = logging.getLogger(__name__) + + +class PortfolioSim(object): + """ + Portfolio management class, loosely based on https://arxiv.org/abs/1706.10059 + + I started with this https://github.com/wassname/rl-portfolio-management + and compared it with the article and the original implementation (see below). + + The original implementation is a bit of a shitshow but the calculation of mu at + least agrees with what we have here: + https://github.com/ZhengyaoJiang/PGPortfolio/blob/master/pgportfolio/learn/nnagent.py + however there are questions around how the code corresponds to the paper, e.g.: + https://github.com/ZhengyaoJiang/PGPortfolio/issues/99 + """ + + def __init__( + self, + asset_names: List[str], + steps: int = 128, + trading_cost: float = 0.001, + reward_type: str = "return", + ): + """ + Class constructor. + + Args: + asset_names: Names of assets in the portfolio. + steps: Maximum number of steps. 
+ trading_cost: Commission rate, currently set to Binance's VIP0 taker level. + reward_type: Whether to use the log return as reward or the sharpe ratio, + which was found to be more stable. + """ + self.asset_names = asset_names + self.asset_num = len(asset_names) + self.steps = steps + self.trading_cost = trading_cost + self.reward_type = reward_type + self.eps = np.finfo(float).eps + self.reset() + + def _step(self, w1, y1): + """ + Step. + + w1 - new action of portfolio weights - e.g. [0.1,0.9, 0.0] + y1 - price relative vector also called return + e.g. [1.0, 0.9, 1.1] + Numbered equations are from https://arxiv.org/abs/1706.10059 + """ + w0 = self.w0 + p0 = self.p0 + + # (eq7) since we last acted prices changed, so weights evolve into + dw1 = (y1 * w0) / (np.dot(y1, w0) + self.eps) + + # (eq16) cost to change portfolio: + # excluding change in cash to avoid double counting for transaction cost + mu = self.trading_cost * (np.abs(dw1[1:] - w1[1:])).sum() + + # (eq11) final portfolio value: see section between (eq19-20) why this works + p1 = p0 * (1 - mu) * np.dot(y1, w0) + + # (eq9 & 10) rate of return log rate of return + rho1 = p1 / p0 - 1 # rate of returns + r1 = np.log(p1 + self.eps) - np.log(p0 + self.eps) + + # (eq22) immediate reward is log rate of return scaled by episode length + if self.reward_type == "return": + reward = r1 / self.steps + # TODO: implement the differentiable sharpe ratio reward like so https://quant.stackexchange.com/a/38040 + + # remember for next step + self.w0 = w1 + self.p0 = p1 + + # if we run out of money, we're done + done = p1 == 0 + + # should only return single values, not list + info = { + "reward": reward, + "log_return": r1, + "portfolio_value": p1, + "market_return": y1.mean(), + "rate_of_return": rho1, + "weights_mean": w1.mean(), + "weights_std": w1.std(), + "cost": mu, + } + # record weights and prices + for i, name in enumerate(["BTCBTC"] + self.asset_names): + info["weight_" + name] = w1[i] + info["price_" + name] = 
y1[i] + + self.infos.append(info) + return reward, info, done + + def reset(self): + self.infos = [] + self.w0 = np.zeros(self.asset_num) + self.w0[0] = 1 + self.p0 = 1.0 From c07fb1c7b1480fa8c99d791372d1ddfb80a0f4f5 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Wed, 23 Dec 2020 13:03:31 +0000 Subject: [PATCH 02/62] modifying the definition of p1 due to experiments in the rl notebook --- notebooks/modelling/rl_env.ipynb | 220 ++++++++++++++++++++--- src/dagobert/modelling/rl/environment.py | 10 +- 2 files changed, 199 insertions(+), 31 deletions(-) diff --git a/notebooks/modelling/rl_env.ipynb b/notebooks/modelling/rl_env.ipynb index cd4bd631..fcf19862 100644 --- a/notebooks/modelling/rl_env.ipynb +++ b/notebooks/modelling/rl_env.ipynb @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -47,7 +47,7 @@ "array([0.48192771, 0.26506024, 0.25301205])" ] }, - "execution_count": 32, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -80,7 +80,7 @@ "0.00020481927710843396" ] }, - "execution_count": 42, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -97,86 +97,252 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1.045" + "0.9997951807228915" ] }, - "execution_count": 43, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "np.dot(y1, w1)" + "p0 * (1 - mu)" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.9997951807228915" + "1.0375" ] }, - "execution_count": 40, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "(1 - mu)" + "np.dot(y1, w0)" ] }, { 
"cell_type": "code", - "execution_count": 37, + "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1.0447859638554216" + "1.0372875000000001" ] }, - "execution_count": 37, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# (eq11) new portfolio value: see section between (eq19-20) why this works\n", - "p1 = p0 * (1 - mu) * np.dot(y1, w1)\n", + "p1 = p0 * (1 - mu) * np.dot(y1, w0)\n", "p1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.03728750000000014" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rho1 = p1 / p0 - 1 # rate of returns\n", + "rho1" + ] + }, + { + "cell_type": "code", + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ + "p0 = p1\n", + "w0 = w1" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 1.0141928915662652\n", + "rho 0.014192891566265242\n", + "p1 1.0290884681079937\n", + "rho 0.014687123786407685\n", + "p1 1.0287802447893062\n", + "rho -0.0002995110024448522\n" + ] + } + ], + "source": [ + "p0 = 1\n", + "w0 = np.array([.5, .25, .25])\n", + "\n", + "def step(y1, w1, w0, p0):\n", + " dw1 = (y1 * w0) / (np.dot(y1, w0) + eps)\n", + " mu = 0.0025 * (np.abs(dw1[1:] - w1[1:])).sum()\n", + " p1 = p0 * (1 - mu) * np.dot(y1, w1)\n", + " rho1 = p1 / p0 - 1\n", + " print('p1', p1)\n", + " print('rho', rho1)\n", + " return p1\n", "\n", - " # (eq16) cost to change portfolio:\n", - " # excluding change in cash to avoid double counting for transaction cost\n", - " mu = self.cost * (np.abs(dw1[1:] - w1[1:])).sum()\n", + "# BTC, ETH is going up but the agent is selling them, so return and p value should go down due to transaction cost in last sale\n", + "y1 = np.array([1, 1.1, 1.05])\n", + "w1 
= np.array([.8, .1, .1])\n", + "p1 = step(y1, w1, w0, p0)\n", "\n", - " # (eq11) final portfolio value: see section between (eq19-20) why this works\n", - " p1 = p0 * (1 - mu) * np.dot(y1, w0)\n", + "y2 = np.array([1, 1.2, 1.1])\n", + "w2 = np.array([.9, .05, .05])\n", + "p2 = step(y2, w2, w1, p1)\n", "\n", - " # (eq9 & 10) rate of return log rate of return\n", - " rho1 = p1 / p0 - 1 # rate of returns\n", - " r1 = np.log(p1 + eps) - np.log(p0 + eps)\n", "\n", - " # (eq22) immediate reward is log rate of return scaled by episode length\n", - " reward = r1 / self.steps" + "y3 = np.array([1, 1.3, 1.15])\n", + "w3 = np.array([1, 0, 0])\n", + "p3 = step(y3, w3, w2, p2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 0.9843092207792208\n", + "rho -0.015690779220779216\n", + "p1 0.9618924183114448\n", + "rho -0.022774146573604104\n", + "p1 0.9253212485790699\n", + "rho -0.03802002078005118\n" + ] + } + ], + "source": [ + "# BTC, ETH is going down and the agent is buying them, so return and p value should go down fast\n", + "y1 = np.array([1, 0.9, .95])\n", + "w1 = np.array([.8, .1, .1])\n", + "\n", + "p1 = step(y1, w1, w0, p0)\n", + "\n", + "y2 = np.array([1, .9, .95])\n", + "w2 = np.array([.7, .15, .15])\n", + "p2 = step(y2, w2, w1, p1)\n", + "\n", + "y3 = np.array([1, .9, .95])\n", + "w3 = np.array([.5, .25, .25])\n", + "p3 = step(y3, w3, w2, p2)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 0.9843092207792208\n", + "rho -0.015690779220779216\n", + "p1 0.976712424016802\n", + "rho -0.0077178965736039995\n", + "p1 0.9764848524822389\n", + "rho -0.0002329974811082769\n" + ] + } + ], + "source": [ + "# BTC, ETH is going down and the agent is selling them, so return and p value should go down but not as fast as in the prev example\n", + "y1 
= np.array([1, 0.9, .95])\n", + "w1 = np.array([.8, .1, .1])\n", + "\n", + "p1 = step(y1, w1, w0, p0)\n", + "\n", + "y2 = np.array([1, .9, .95])\n", + "w2 = np.array([.9, .05, .05])\n", + "p2 = step(y2, w2, w1, p1)\n", + "\n", + "y3 = np.array([1, .9, .95])\n", + "w3 = np.array([1, 0, 0])\n", + "p3 = step(y3, w3, w2, p2)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p1 1.0092117073170732\n", + "rho 0.009211707317073214\n", + "p1 1.024114079612195\n", + "rho 0.014766349009900814\n", + "p1 1.0492192209883782\n", + "rho 0.024514008620689864\n" + ] + } + ], + "source": [ + "# BTC, ETH is going up and the agent is buying them, so return and p value should go up\n", + "y1 = np.array([1, 1.05, 1.05])\n", + "w1 = np.array([.8, .1, .1])\n", + "\n", + "p1 = step(y1, w1, w0, p0)\n", + "\n", + "y2 = np.array([1, 1.05, 1.05])\n", + "w2 = np.array([.7, .15, .15])\n", + "\n", + "p2 = step(y2, w2, w1, p1)\n", + "\n", + "y3 = np.array([1, 1.05, 1.05])\n", + "w3 = np.array([.5, .25, .25])\n", + "\n", + "p3 = step(y3, w3, w2, p2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using the environment" ] }, { diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index ce8861da..f5c3521c 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -72,8 +72,10 @@ def _step(self, w1, y1): # excluding change in cash to avoid double counting for transaction cost mu = self.trading_cost * (np.abs(dw1[1:] - w1[1:])).sum() - # (eq11) final portfolio value: see section between (eq19-20) why this works - p1 = p0 * (1 - mu) * np.dot(y1, w0) + # (eq11) final portfolio value: after lot of experiments in rl notebook and + # reading the relevant parts of the paper a ton of times, we use w1 here + # instead of w0, also bc it makes intuitive sense this way + p1 = p0 * (1 - 
mu) * np.dot(y1, w1) # (eq9 & 10) rate of return log rate of return rho1 = p1 / p0 - 1 # rate of returns @@ -100,10 +102,10 @@ def _step(self, w1, y1): "rate_of_return": rho1, "weights_mean": w1.mean(), "weights_std": w1.std(), - "cost": mu, + "rebalancing_cost": mu, } # record weights and prices - for i, name in enumerate(["BTCBTC"] + self.asset_names): + for i, name in enumerate(["USD"] + self.asset_names): info["weight_" + name] = w1[i] info["price_" + name] = y1[i] From 9728317d6b702a5b0c4c6409ac323676426ff987 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Wed, 23 Dec 2020 15:49:14 +0000 Subject: [PATCH 03/62] flashing out the runner and env.py a bit more --- setup.cfg | 1 + src/dagobert/modelling/rl/__init__.py | 1 + src/dagobert/modelling/rl/environment.py | 244 +++++++++++++++++++++-- src/dagobert/modelling/rl/rl.py | 5 + src/dagobert/modelling/rl/rl_runner.py | 38 ++++ 5 files changed, 276 insertions(+), 13 deletions(-) create mode 100644 src/dagobert/modelling/rl/rl.py create mode 100644 src/dagobert/modelling/rl/rl_runner.py diff --git a/setup.cfg b/setup.cfg index c845e0c5..bbc2948b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ console_scripts = dagobert-tcn = dagobert.modelling.dl.tcn_runner:run dagobert-optuna = dagobert.modelling.dl.optuna:run dagobert-s3 = dagobert.io.runner:run + dagobert-tcn = dagobert.modelling.rl.rl_runner:run [test] # py.test options when running `python setup.py test` diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py index e69de29b..56d3ada1 100644 --- a/src/dagobert/modelling/rl/__init__.py +++ b/src/dagobert/modelling/rl/__init__.py @@ -0,0 +1 @@ +from .environment import RLData, RLPortfolio, RLEnv diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index f5c3521c..c8fbf72b 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -2,19 +2,71 @@ Class defining PyTorch datasets for 
supervised modelling of a single instrument. """ import logging -from typing import List +from typing import List, Tuple +from argparse import Namespace import gym import numpy as np import pandas as pd import matplotlib.pyplot as plt +from torch.utils.data import Dataset, DataLoader -from dagobert.modelling.dl import PortfolioCryptoDataset +from dagobert.modelling.dl import PortfolioCryptoDataset, Preprocessing logger = logging.getLogger(__name__) +eps = np.finfo(float).eps -class PortfolioSim(object): +class RLData(object): + """ + Leverages the data class and configuration methods from the `dagobert.modelling.dl` + module as much as possible. + """ + + def __init__(self, hparams: Namespace, train_val_test: str = "train"): + """ + Class constructor. + + Args: + hparams: Hyparams parsed by the rl_runner. Similar to how `TCNLightning` is + initialized. + train_val_test: Whether we are training, validating or testing, it must be + either train, val or test. + """ + RLData._pre_sanity_check(hparams) + hparams = Preprocessing().preprocess_augment_dfs(hparams) + self.hparams = Preprocessing().preprocess_train_dfs(hparams) + if train_val_test == "train": + augment_dfs = self.hparams.augment_dfs + augment_method = self.hparams.augment_method + else: + augment_dfs = None + augment_method = None + self.dataset = PortfolioCryptoDataset( + df_to_load=getattr(self.hparams, f"df_{train_val_test}"), + cols_to_model=self.hparams.cols_to_model, + target_col=self.hparams.target_col, + mini_series_length=self.hparams.mini_series_length, + data_dir=self.hparams.data_dir, + augment_method=augment_method, + augment_prob=self.hparams.augment_prob, + augment_dfs=augment_dfs, + augment_dfs_mix=self.hparams.augment_dfs_mix, + ) + self.dataloader = DataLoader(self.dataset) + + def step(self): + pass + + def reset(self): + pass + + @staticmethod + def _pre_sanity_check(hparams: Namespace): + pass + + +class RLPortfolio(object): """ Portfolio management class, loosely based on 
https://arxiv.org/abs/1706.10059 @@ -50,23 +102,27 @@ def __init__( self.steps = steps self.trading_cost = trading_cost self.reward_type = reward_type - self.eps = np.finfo(float).eps self.reset() - def _step(self, w1, y1): + def _step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: """ - Step. + See Figure 1 in https://arxiv.org/abs/1706.10059 to understand what this one + step corresponds to, also the equation numbers correspond to the paper's. + + Args: + w1: New weights of the portfolio. + y1: New relative price vector for the portfolio's instruments. The first + element refers to the cash asset (USD). Therefore y1[0] = 1 always. - w1 - new action of portfolio weights - e.g. [0.1,0.9, 0.0] - y1 - price relative vector also called return - e.g. [1.0, 0.9, 1.1] - Numbered equations are from https://arxiv.org/abs/1706.10059 + Returns: + Typical reward, info, done vars for an OpenAI Gym. """ w0 = self.w0 p0 = self.p0 + assert y1[0] == 1, "Cash price has to remain constant: 1." 
# (eq7) since we last acted prices changed, so weights evolve into - dw1 = (y1 * w0) / (np.dot(y1, w0) + self.eps) + dw1 = (y1 * w0) / (np.dot(y1, w0) + eps) # (eq16) cost to change portfolio: # excluding change in cash to avoid double counting for transaction cost @@ -79,7 +135,7 @@ def _step(self, w1, y1): # (eq9 & 10) rate of return log rate of return rho1 = p1 / p0 - 1 # rate of returns - r1 = np.log(p1 + self.eps) - np.log(p0 + self.eps) + r1 = np.log(p1 + eps) - np.log(p0 + eps) # (eq22) immediate reward is log rate of return scaled by episode length if self.reward_type == "return": @@ -91,7 +147,7 @@ def _step(self, w1, y1): self.p0 = p1 # if we run out of money, we're done - done = p1 == 0 + done = p1 <= 0 # should only return single values, not list info = { @@ -117,3 +173,165 @@ def reset(self): self.w0 = np.zeros(self.asset_num) self.w0[0] = 1 self.p0 = 1.0 + + +class RLEnv(gym.Env): + """ + A reinforcement learning environment for financial portfolio management, based on + https://arxiv.org/abs/1706.10059 and this implementation + https://github.com/wassname/rl-portfolio-management + """ + + def __init__( + self, + history, + abbreviation, + steps=730, # 2 years + trading_cost=0.0025, + time_cost=0.00, + window_length=50, + start_idx=0, + sample_start_date=None, + ): + """ + An environment for financial portfolio management. 
+ Params: + steps - steps in episode + scale - scale data and each episode (except return) + augment - fraction to randomly shift data by + trading_cost - cost of trade as a fraction + time_cost - cost of holding as a fraction + window_length - how many past observations to return + start_idx - The number of days from '2012-08-13' of the dataset + sample_start_date - The start date sampling from the history + """ + self.window_length = window_length + self.num_stocks = history.shape[0] + self.start_idx = start_idx + + self.src = RLData( + history, + abbreviation, + steps=steps, + window_length=window_length, + start_idx=start_idx, + start_date=sample_start_date, + ) + + self.sim = RLPortfolio( + asset_names=abbreviation, + trading_cost=trading_cost, + time_cost=time_cost, + steps=steps, + ) + + # openai gym attributes + # action will be the portfolio weights from 0 to 1 for each asset + self.action_space = gym.spaces.Box( + 0, 1, shape=(len(self.src.asset_names) + 1,), dtype=np.float32 + ) # include cash + + # get the observation space from the data min and max + self.observation_space = gym.spaces.Box( + low=-np.inf, + high=np.inf, + shape=(len(abbreviation), window_length, history.shape[-1]), + dtype=np.float32, + ) + + def step(self, action): + return self._step(action) + + def _step(self, action): + """ + Step the env. + Actions should be portfolio [w0...] + - Where wn is a portfolio weight from 0 to 1. The first is cash_bias + - cn is the portfolio conversion weights see PortioSim._step for description + """ + np.testing.assert_almost_equal(action.shape, (len(self.sim.asset_names) + 1,)) + + # normalise just in case + action = np.clip(action, 0, 1) + + weights = action # np.array([cash_bias] + list(action)) # [w0, w1...] + weights /= weights.sum() + eps + weights[0] += np.clip( + 1 - weights.sum(), 0, 1 + ) # so if weights are all zeros we normalise to [1,0...] + + assert ((action >= 0) * (action <= 1)).all(), ( + "all action values should be between 0 and 1. 
Not %s" % action + ) + np.testing.assert_almost_equal( + np.sum(weights), + 1.0, + 3, + err_msg='weights should sum to 1. action="%s"' % weights, + ) + + observation, done1, ground_truth_obs = self.src._step() + + # concatenate observation with ones + cash_observation = np.ones((1, self.window_length, observation.shape[2])) + observation = np.concatenate((cash_observation, observation), axis=0) + + cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) + ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) + + # relative price vector of last observation day (close/open) + close_price_vector = observation[:, -1, 3] + open_price_vector = observation[:, -1, 0] + y1 = close_price_vector / open_price_vector + reward, info, done2 = self.sim._step(weights, y1) + + # calculate return for buy and hold a bit of each asset + info["market_value"] = np.cumprod( + [inf["return"] for inf in self.infos + [info]] + )[-1] + # add dates + info["date"] = self.start_idx + self.src.idx + self.src.step + info["steps"] = self.src.step + info["next_obs"] = ground_truth_obs + + self.infos.append(info) + + return observation, reward, done1 or done2, info + + def reset(self): + return self._reset() + + def _reset(self): + self.infos = [] + self.sim.reset() + observation, ground_truth_obs = self.src.reset() + cash_observation = np.ones((1, self.window_length, observation.shape[2])) + observation = np.concatenate((cash_observation, observation), axis=0) + cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) + ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) + info = {} + info["next_obs"] = ground_truth_obs + return observation, info + + def _render(self, mode="human", close=False): + if close: + return + if mode == "ansi": + print(self.infos[-1]) + elif mode == "human": + self.plot() + + def render(self, mode="human", close=False): + return self._render(mode="human", close=False) + + def plot(self): + # show a plot of 
portfolio vs mean market performance + df_info = pd.DataFrame(self.infos) + df_info["date"] = pd.to_datetime(df_info["date"], format="%Y-%m-%d") + df_info.set_index("date", inplace=True) + mdd = max_drawdown(df_info.rate_of_return + 1) + sharpe_ratio = sharpe(df_info.rate_of_return) + title = "max_drawdown={: 2.2%} sharpe_ratio={: 2.4f}".format(mdd, sharpe_ratio) + df_info[["portfolio_value", "market_value"]].plot( + title=title, fig=plt.gcf(), rot=30 + ) diff --git a/src/dagobert/modelling/rl/rl.py b/src/dagobert/modelling/rl/rl.py new file mode 100644 index 00000000..ca60f407 --- /dev/null +++ b/src/dagobert/modelling/rl/rl.py @@ -0,0 +1,5 @@ +from dagobert.modelling.rl import RLEnv + + +def run_rl(args): + env = RLEnv(args) diff --git a/src/dagobert/modelling/rl/rl_runner.py b/src/dagobert/modelling/rl/rl_runner.py new file mode 100644 index 00000000..8a777213 --- /dev/null +++ b/src/dagobert/modelling/rl/rl_runner.py @@ -0,0 +1,38 @@ +""" +Dagobert's runner for reinforcement learning. + +This module is driven by the `dagobert-rl` command which can be parametrised by +command line arguments, but it's much more convenient to use YAML configs for this, +see the `tcn_args.py` for more detail. +""" +import logging +from pathlib import Path + +from dagobert.utils import setup_logging +from dagobert.runner_utils import load_config, update_args +from dagobert.modelling.dl.tcn_args import get_all_args +from dagobert.modelling.rl.rl import run_rl + + +logger = logging.getLogger(__name__) + + +def run(): + """ + Run a single TCN training or parallelized hyper parameter tuning study using optuna. 
+ """ + + # parse arguments amd setup logging + args = get_all_args() + setup_logging(logger, "dagobert-rl", logging.INFO, args.log_dir) + + # load config yaml if exist + if args.config_path != "": + config = load_config(Path(args.config_path)) + args = update_args(args, config) + + run_rl(args) + + +if __name__ == "__main__": + run() From aa12cb12f19765209cd8d9774121f049cec9862d Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Wed, 30 Dec 2020 17:11:08 +0000 Subject: [PATCH 04/62] now we have all the components in place and I know what to do, now let's make this work --- config/rl_config.yaml | 5 +- notebooks/modelling/rl_env.ipynb | 165 +++------- setup.cfg | 2 +- src/dagobert/modelling/dl/data.py | 47 ++- src/dagobert/modelling/dl/preprocessing.py | 18 +- src/dagobert/modelling/dl/tcn.py | 3 +- src/dagobert/modelling/rl/environment.py | 71 ++++- src/dagobert/modelling/rl/networks.py | 143 +++++++++ src/dagobert/modelling/rl/ppo.py | 340 +++++++++++++++++++++ src/dagobert/modelling/rl/rl.py | 5 +- src/dagobert/naming.py | 9 + 11 files changed, 654 insertions(+), 154 deletions(-) create mode 100644 src/dagobert/modelling/rl/networks.py create mode 100644 src/dagobert/modelling/rl/ppo.py diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 6508176d..da8f251d 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -5,6 +5,7 @@ # -------------------------------------------------------------------------------------- gpus: 1 +multiprocessing: False pin_memory: True profiler: True val_check_interval: 0.5 @@ -40,8 +41,8 @@ non_last_y_frac: 0.5 regression: False density_num: 3 mix_density_net: False -no_class_weights: False -no_sample_weights: False +no_class_weights: True +no_sample_weights: True # -------------------------------------------------------------------------------------- # DATA diff --git a/notebooks/modelling/rl_env.ipynb b/notebooks/modelling/rl_env.ipynb index fcf19862..ea5164b2 100644 --- a/notebooks/modelling/rl_env.ipynb +++ 
b/notebooks/modelling/rl_env.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -23,8 +23,7 @@ "import numpy as np\n", "import pandas as pd\n", "\n", - "from dagobert.io import S3Connector\n", - "from dagobert.modelling.rl.environment import PortfolioEnv" + "from dagobert.io import S3Connector" ] }, { @@ -38,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -47,7 +46,7 @@ "array([0.48192771, 0.26506024, 0.25301205])" ] }, - "execution_count": 51, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -71,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -80,7 +79,7 @@ "0.00020481927710843396" ] }, - "execution_count": 52, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -137,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -146,7 +145,7 @@ "1.0372875000000001" ] }, - "execution_count": 55, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -159,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -168,7 +167,7 @@ "0.03728750000000014" ] }, - "execution_count": 56, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -180,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -190,19 +189,21 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "p1 1.0141928915662652\n", - "rho 0.014192891566265242\n", - "p1 1.0290884681079937\n", - "rho 0.014687123786407685\n", - "p1 1.0287802447893062\n", - "rho 
-0.0002995110024448522\n" + "p1 1.0366750000000002\n", + "rho 0.036675000000000235\n", + "p1 1.0674461056875002\n", + "rho 0.029682500000000056\n", + "p1 1.0911367376956023\n", + "rho 0.022193750000000012\n", + "p1 1.0911367376956023\n", + "rho 0.0\n" ] } ], @@ -213,7 +214,7 @@ "def step(y1, w1, w0, p0):\n", " dw1 = (y1 * w0) / (np.dot(y1, w0) + eps)\n", " mu = 0.0025 * (np.abs(dw1[1:] - w1[1:])).sum()\n", - " p1 = p0 * (1 - mu) * np.dot(y1, w1)\n", + " p1 = p0 * (1 - mu) * np.dot(y1, w0)\n", " rho1 = p1 / p0 - 1\n", " print('p1', p1)\n", " print('rho', rho1)\n", @@ -231,24 +232,28 @@ "\n", "y3 = np.array([1, 1.3, 1.15])\n", "w3 = np.array([1, 0, 0])\n", - "p3 = step(y3, w3, w2, p2)\n" + "p3 = step(y3, w3, w2, p2)\n", + "\n", + "y4 = np.array([1, 1.5, 1.5])\n", + "w4 = np.array([1, 0, 0])\n", + "p4 = step(y4, w4, w3, p3)\n" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "p1 0.9843092207792208\n", - "rho -0.015690779220779216\n", - "p1 0.9618924183114448\n", - "rho -0.022774146573604104\n", - "p1 0.9253212485790699\n", - "rho -0.03802002078005118\n" + "p1 0.9618249999999999\n", + "rho -0.03817500000000007\n", + "p1 0.9471319208437499\n", + "rho -0.015276250000000102\n", + "p1 0.9253212485790698\n", + "rho -0.023028125000000066\n" ] } ], @@ -270,19 +275,19 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "p1 0.9843092207792208\n", - "rho -0.015690779220779216\n", - "p1 0.976712424016802\n", - "rho -0.0077178965736039995\n", - "p1 0.9764848524822389\n", - "rho -0.0002329974811082769\n" + "p1 0.9618249999999999\n", + "rho -0.03817500000000007\n", + "p1 0.9471896303437499\n", + "rho -0.015216250000000042\n", + "p1 0.9398666705141548\n", + "rho -0.007731249999999967\n" ] } ], @@ -304,19 +309,19 @@ }, { "cell_type": "code", - 
"execution_count": 107, + "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "p1 1.0092117073170732\n", - "rho 0.009211707317073214\n", - "p1 1.024114079612195\n", - "rho 0.014766349009900814\n", - "p1 1.0492192209883782\n", - "rho 0.024514008620689864\n" + "p1 1.0242\n", + "rho 0.0242\n", + "p1 1.0342038734999999\n", + "rho 0.009767499999999929\n", + "p1 1.049219220988378\n", + "rho 0.014518749999999914\n" ] } ], @@ -337,82 +342,6 @@ "\n", "p3 = step(y3, w3, w2, p2)" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using the environment" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "data_dir = Path('C:/Work/dagobert/data/modelling')\n", - "instruments = ['BTC', 'ETH', 'XRP', 'LTC']\n", - "datetimes = None\n", - "\n", - "# work out the common datetimes\n", - "for instrument in instruments:\n", - " df = pd.read_feather(data_dir/f'std_bar_{instrument}USDT_tick_1.feather')\n", - " df = df.set_index('date_time')\n", - " if datetimes is not None:\n", - " datetimes = df.index.intersection(datetimes)\n", - " else:\n", - " datetimes = df.index\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# merge instruments \n", - "cols_to_select = ['open', 'low', 'high', 'close', 'volume']\n", - "history = np.empty((len(instruments), len(datetimes), len(cols_to_select)))\n", - "for i,instrument in enumerate(instruments):\n", - " df = pd.read_feather(data_dir/f'std_bar_{instrument}USDT_tick_1.feather')\n", - " history[i, :, :] = df.set_index('date_time').loc[datetimes, cols_to_select].values" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# make portfolio\n", - "portfolio = PortfolioEnv(history, instruments, len(datetimes), window_length=1440)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - 
"metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'PortfolioSim' object has no attribute 'p0'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mportfolio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32mc:\\users\\danih\\dropbox\\dagobert\\dagobert\\src\\dagobert\\modelling\\rl\\environment.py\u001b[0m in \u001b[0;36mstep\u001b[1;34m(self, action)\u001b[0m\n\u001b[0;32m 253\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 254\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 255\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_step\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 256\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 257\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0m_step\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\users\\danih\\dropbox\\dagobert\\dagobert\\src\\dagobert\\modelling\\rl\\environment.py\u001b[0m in \u001b[0;36m_step\u001b[1;34m(self, action)\u001b[0m\n\u001b[0;32m 296\u001b[0m \u001b[0mopen_price_vector\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mobservation\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 297\u001b[0m \u001b[0my1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mclose_price_vector\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mopen_price_vector\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 298\u001b[1;33m \u001b[0mreward\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minfo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdone2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msim\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_step\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mweights\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 299\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 300\u001b[0m \u001b[1;31m# calculate return for buy and hold a bit of each asset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\users\\danih\\dropbox\\dagobert\\dagobert\\src\\dagobert\\modelling\\rl\\environment.py\u001b[0m in \u001b[0;36m_step\u001b[1;34m(self, w1, y1)\u001b[0m\n\u001b[0;32m 146\u001b[0m \u001b[1;32massert\u001b[0m \u001b[0my1\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m 
\u001b[1;36m1.0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"y1[0] must be 1\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 147\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 148\u001b[1;33m \u001b[0mp0\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mp0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 149\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 150\u001b[0m \u001b[0mdw1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0my1\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mw1\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mw1\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0meps\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# (eq7) weights evolve into\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mAttributeError\u001b[0m: 'PortfolioSim' object has no attribute 'p0'" - ] - } - ], - "source": [ - "w = np.array([0, .25, .25, .25, .25])\n", - "portfolio.step(w)" - ] } ], "metadata": { diff --git a/setup.cfg b/setup.cfg index bbc2948b..f0d2b5ce 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,7 +57,7 @@ console_scripts = dagobert-tcn = dagobert.modelling.dl.tcn_runner:run dagobert-optuna = dagobert.modelling.dl.optuna:run dagobert-s3 = dagobert.io.runner:run - dagobert-tcn = dagobert.modelling.rl.rl_runner:run + dagobert-rl = dagobert.modelling.rl.rl_runner:run [test] # py.test options when running `python setup.py test` diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 09825bda..1db71a2b 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -5,7 +5,7 @@ import logging from pathlib import Path from argparse import Namespace -from typing import List, Tuple, Union +from 
typing import List, Tuple, Union, Iterable, Callable import torch import numpy as np @@ -13,9 +13,9 @@ from matplotlib.figure import Figure from matplotlib import pyplot as plt from sklearn.preprocessing import MinMaxScaler -from torch.utils.data import Dataset, DataLoader +from torch.utils.data import Dataset, DataLoader, IterableDataset -from dagobert.naming import NBarVars, NAugmentationMethods +from dagobert.naming import NBarVars, NAugmentationMethods, NRL from dagobert.naming import NPreprocessingArgs as npa from dagobert.preprocessing.utils import set_dt_index from dagobert.modelling.augmentation import augment @@ -567,17 +567,19 @@ class PortfolioCryptoDataset(CryptoDataset): This extends :class:`dagobert.modelling.dl.data.CryptoDataset` to make it suitable for multi instrument portfolio optimization through reinforcement-learning. - Instead of returning an array of Xs and single y, this returns only the Xs, and - uses the last time step of the Xs as the target. + Instead of returning an array of Xs and single y, this returns for each X a y. This + is achieved by adding the rl_return target column to the cols_to_model at init, and + then fishing it out for each sample before returning it. + + This convulated way was used so we can repurpose and keep as much of the original + CryptoDataset as possible, without extensive refactoring. 
""" def __init__(self, *args, **kw): super().__init__(*args, **kw) - # for each instrument, fish out the index of the target_col (close_0 by default) - self.target_col_ix = [ - np.where(self.target_col == np.array(cols))[0] - for cols in self.cols_to_model.values() - ] + # for each instrument, we add the rl_return target col to their cols_to_model + for df_name, cols in self.cols_to_model.items(): + self.cols_to_model[df_name].append(NRL.rl_return) def __getitem__(self, idx): """ @@ -589,9 +591,28 @@ def __getitem__(self, idx): from_idx, upto_idx = self._get_from_upto_idxs(idx, batch_indices) Xs = self._get_Xs(batch_dfs, from_idx, upto_idx) - # cut off last time-point from each X as use that as y. + # the last column is y (see __init__), so we fish it out and delete it from X ys = np.empty(len(Xs)) for i, X in enumerate(Xs): - Xs[i] = X[:, :-1] - ys[i] = X[self.target_col_ix[i], -1] + ys[i] = X[-1, -1] + Xs[i] = X[:-1, :] return Xs, ys + + +class ExperienceSourceDataset(IterableDataset): + """ + Implementation from PyTorch Lightning Bolts. This allows us to use Lightning in a + reinforcement learning setting where we first need to generate our training data + by interacting with the environment, and only then use it to train our policy. + + Basic experience source dataset. Takes a generate_batch function that returns an + iterator. The logic for the experience source and how the batch is generated is + defined ihbthe PPO Lightning model itself. 
+ """ + + def __init__(self, generate_batch: Callable): + self.generate_batch = generate_batch + + def __iter__(self) -> Iterable: + iterator = self.generate_batch() + return iterator diff --git a/src/dagobert/modelling/dl/preprocessing.py b/src/dagobert/modelling/dl/preprocessing.py index 50efa475..ffa6916a 100644 --- a/src/dagobert/modelling/dl/preprocessing.py +++ b/src/dagobert/modelling/dl/preprocessing.py @@ -16,7 +16,7 @@ from sklearn.utils.validation import check_is_fitted from sklearn.exceptions import NotFittedError -from dagobert.naming import NBarriers, NCrossValidation, NTimeFeatures, NBarVars +from dagobert.naming import NBarriers, NCrossValidation, NTimeFeatures, NBarVars, NRL from dagobert.naming import NPreprocessingArgs as npa from dagobert.naming import NBarriers as nb from dagobert.io import FeatherWriter, S3Connector @@ -138,6 +138,11 @@ def _preprocess_augment_dfs( df, hparams.binariser_method, hparams.binariser_threshold, df_name ) + # add rl return if required + if hparams.target_col == NRL.rl_return: + rl_return = (df[NBarVars.close] / df[NBarVars.close].shift()).fillna(1) + df.insert(0, NRL.rl_return, rl_return) + # scale all numeric columns if hparams.scaling_method is not None: cols_not_to_scale = set( @@ -162,7 +167,11 @@ def _preprocess_augment_dfs( logger.info(f"Fit-transformed {log_msg}") # save transformed file - if hparams.to_label or hparams.scaling_method: + if ( + hparams.to_label + or hparams.scaling_method + or hparams.target_col == NRL.rl_return + ): feather_writer = FeatherWriter(output_path=data_dir / df_path_prev) feather_writer.write(df) @@ -292,6 +301,11 @@ def _preprocess_train_dfs( df, hparams.binariser_method, hparams.binariser_threshold, df_name ) + # add rl return if required + if hparams.target_col == NRL.rl_return: + rl_return = (df[NBarVars.close] / df[NBarVars.close].shift()).fillna(1) + df.insert(0, NRL.rl_return, rl_return) + # split splitter = TrainValTestSplitter(data_connector=df) df_train, df_val, df_test 
= splitter.split( diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index 3db77df8..0fefa51d 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -182,7 +182,7 @@ def __init__(self, hparams: Namespace): # define main vars (other than model) super().__init__() - TCNLightning._pre_sanity_check(hparams) + hparams = TCNLightning._pre_sanity_check(hparams) # lightning sets this to cuda too late for some of our setup to work self.tcn_device = "cuda" if hparams.gpus > 0 else "cpu" hparams = Preprocessing().preprocess_augment_dfs(hparams) @@ -686,6 +686,7 @@ def _pre_sanity_check(hparams: Namespace): raise ValueError( "If you use augment_dfs you must at least define the anchor key." ) + return hparams def _sanity_check(self): """ diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index c8fbf72b..a4bd7b72 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -11,7 +11,15 @@ import matplotlib.pyplot as plt from torch.utils.data import Dataset, DataLoader -from dagobert.modelling.dl import PortfolioCryptoDataset, Preprocessing +from dagobert.naming import NRL +from dagobert.modelling.dl import ( + PortfolioCryptoDataset, + Preprocessing, + TemporalConvNet, + TCNLightning, +) +from dagobert.modelling.utils import update_lookback + logger = logging.getLogger(__name__) eps = np.finfo(float).eps @@ -30,10 +38,11 @@ def __init__(self, hparams: Namespace, train_val_test: str = "train"): Args: hparams: Hyparams parsed by the rl_runner. Similar to how `TCNLightning` is initialized. + train_val_test: Whether we are training, validating or testing, it must be either train, val or test. 
""" - RLData._pre_sanity_check(hparams) + hparams = RLData._pre_sanity_check(hparams) hparams = Preprocessing().preprocess_augment_dfs(hparams) self.hparams = Preprocessing().preprocess_train_dfs(hparams) if train_val_test == "train": @@ -53,17 +62,52 @@ def __init__(self, hparams: Namespace, train_val_test: str = "train"): augment_dfs=augment_dfs, augment_dfs_mix=self.hparams.augment_dfs_mix, ) - self.dataloader = DataLoader(self.dataset) + + self.dataloader = iter(DataLoader(self.dataset)) + + from IPython import embed + + embed() def step(self): - pass + Xs, ys = self.dataset[idx] + return Xs, y, False def reset(self): - pass + self.dataloader = iter(DataLoader(self.dataset)) @staticmethod def _pre_sanity_check(hparams: Namespace): - pass + + if hparams.target_col != NRL.rl_return: + raise ValueError("target_col has to be rl_return for RL tasks.") + + net_depth = len(hparams.num_channels) + k_size = hparams.kernel_size + max_seq_len = TemporalConvNet.get_tcn_receptive_field_size(k_size, net_depth) + logger.info( + f"A TCN with kernel size: {k_size} and depth: {net_depth} has a receptive " + f"field (can read a maximum sequence length) of {max_seq_len}." + ) + if hparams.mini_series_length == "auto": + logger.info(f"We set mini_series_length from 'auto' to {max_seq_len}.") + hparams.mini_series_length = max_seq_len + if ( + hparams.mini_series_length != "auto" + and hparams.mini_series_length > max_seq_len + ): + logger.warning( + f"Provided mini-series length: {hparams.mini_series_length} is " + f"larger than the networks receptive field size: {max_seq_len}." + ) + # calcualte what the current TCN setup corresponds to in hourly lookback + df_anchor = TCNLightning._load_anchor(hparams) + hparams.lookback = update_lookback(df_anchor, hparams.mini_series_length) + logger.info( + f"The current mini_series_legnth {hparams.mini_series_length}, " + f"corresponds to an estimated lookback of {hparams.lookback} hours." 
+ ) + return hparams class RLPortfolio(object): @@ -83,7 +127,7 @@ class RLPortfolio(object): def __init__( self, asset_names: List[str], - steps: int = 128, + episode_length: int = 1000, trading_cost: float = 0.001, reward_type: str = "return", ): @@ -92,14 +136,12 @@ def __init__( Args: asset_names: Names of assets in the portfolio. - steps: Maximum number of steps. trading_cost: Commission rate, currently set to Binance's VIP0 taker level. reward_type: Whether to use the log return as reward or the sharpe ratio, which was found to be more stable. """ self.asset_names = asset_names self.asset_num = len(asset_names) - self.steps = steps self.trading_cost = trading_cost self.reward_type = reward_type self.reset() @@ -128,10 +170,10 @@ def _step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: # excluding change in cash to avoid double counting for transaction cost mu = self.trading_cost * (np.abs(dw1[1:] - w1[1:])).sum() - # (eq11) final portfolio value: after lot of experiments in rl notebook and - # reading the relevant parts of the paper a ton of times, we use w1 here - # instead of w0, also bc it makes intuitive sense this way - p1 = p0 * (1 - mu) * np.dot(y1, w1) + # (eq11) final portfolio value: I thought this should be w1 (at the end), but + # then think through how the env actually models the world (see Figure 1), w0 + # (which is the original implementation) makes sense here. 
+ p1 = p0 * (1 - mu) * np.dot(y1, w0) # (eq9 & 10) rate of return log rate of return rho1 = p1 / p0 - 1 # rate of returns @@ -139,7 +181,7 @@ def _step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: # (eq22) immediate reward is log rate of return scaled by episode length if self.reward_type == "return": - reward = r1 / self.steps + reward = r1 # TODO: implement the differentiable sharpe ratio reward like so https://quant.stackexchange.com/a/38040 # remember for next step @@ -186,7 +228,6 @@ def __init__( self, history, abbreviation, - steps=730, # 2 years trading_cost=0.0025, time_cost=0.00, window_length=50, diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py new file mode 100644 index 00000000..4e2745b6 --- /dev/null +++ b/src/dagobert/modelling/rl/networks.py @@ -0,0 +1,143 @@ +from typing import Union, Tuple + +import torch +from torch import nn +from torch.distributions import Categorical, Normal + + +def create_mlp(input_shape: Tuple[int], n_actions: int, hidden_sizes: list = [64, 64]): + """ + Simple Multi-Layer Perceptron network + """ + net_layers = [] + net_layers.append(nn.Linear(input_shape[0], hidden_sizes[0])) + net_layers.append(nn.ReLU()) + + for i in range(len(hidden_sizes) - 1): + net_layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i + 1])) + net_layers.append(nn.ReLU()) + net_layers.append(nn.Linear(hidden_sizes[-1], n_actions)) + + return nn.Sequential(*net_layers) + + +class ActorCategorical(nn.Module): + """ + Policy network, for discrete action spaces, which returns a distribution + and an action given an observation + """ + + def __init__(self, actor_net): + """ + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + super().__init__() + + self.actor_net = actor_net + + def forward(self, states): + logits = self.actor_net(states) + pi = Categorical(logits=logits) + actions = pi.sample() + + return pi, 
actions + + def get_log_prob(self, pi: Categorical, actions: torch.Tensor): + """ + Takes in a distribution and actions and returns log prob of actions + under the distribution + Args: + pi: torch distribution + actions: actions taken by distribution + Returns: + log probability of the acition under pi + """ + return pi.log_prob(actions) + + +class ActorContinous(nn.Module): + """ + Policy network, for continous action spaces, which returns a distribution + and an action given an observation + """ + + def __init__(self, actor_net, act_dim): + """ + Args: + input_shape: observation shape of the environment + n_actions: number of discrete actions available in the environment + """ + super().__init__() + self.actor_net = actor_net + log_std = -0.5 * torch.ones(act_dim, dtype=torch.float) + self.log_std = torch.nn.Parameter(log_std) + + def forward(self, states): + mu = self.actor_net(states) + std = torch.exp(self.log_std) + pi = Normal(loc=mu, scale=std) + actions = pi.sample() + + return pi, actions + + def get_log_prob(self, pi: Normal, actions: torch.Tensor): + """ + Takes in a distribution and actions and returns log prob of actions + under the distribution + Args: + pi: torch distribution + actions: actions taken by distribution + Returns: + log probability of the acition under pi + """ + return pi.log_prob(actions).sum(axis=-1) + + +class ActorCriticAgent(object): + """ + Actor Critic Agent used during trajectory collection. It returns a + distribution and an action given an observation. 
Agent based on the + implementations found here: https://github.com/Shmuma/ptan/blob/master/ptan/agent.py + + """ + + def __init__(self, actor_net: nn.Module, critic_net: nn.Module): + self.actor_net = actor_net + self.critic_net = critic_net + + @torch.no_grad() + def __call__(self, state: torch.Tensor, device: str) -> Tuple: + """ + Takes in the current state and returns the agents policy, sampled + action, log probability of the action, and value of the given state + Args: + states: current state of the environment + device: the device used for the current batch + Returns: + torch dsitribution and randomly sampled action + """ + + state = state.to(device=device) + + pi, actions = self.actor_net(state) + log_p = self.get_log_prob(pi, actions) + + value = self.critic_net(state) + + return pi, actions, log_p, value + + def get_log_prob( + self, pi: Union[Categorical, Normal], actions: torch.Tensor + ) -> torch.Tensor: + """ + Takes in the current state and returns the agents policy, a sampled + action, log probability of the action, and the value of the state + Args: + pi: torch distribution + actions: actions taken by distribution + Returns: + log probability of the acition under pi + """ + return self.actor_net.get_log_prob(pi, actions) diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py new file mode 100644 index 00000000..30b1e957 --- /dev/null +++ b/src/dagobert/modelling/rl/ppo.py @@ -0,0 +1,340 @@ +from typing import List, Tuple + +import pytorch_lightning as pl +from networks import create_mlp, ActorCriticAgent, ActorCategorical, ActorContinous +from data import ExperienceSourceDataset + +import torch +from torch.utils.data import DataLoader +import torch.optim as optim +from torch.optim.optimizer import Optimizer +import numpy as np + +try: + import gym +except ModuleNotFoundError: + _GYM_AVAILABLE = False +else: + _GYM_AVAILABLE = True + + +class PPO(pl.LightningModule): + """ + PyTorch Lightning implementation of `PPO + `_ + 
Paper authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov + + Example: + model = PPO("CartPole-v0") + Train: + trainer = Trainer() + trainer.fit(model) + Note: + This example is based on: + https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py + https://github.com/PyTorchLightning/pytorch-lightning-bolts/blob/master/pl_bolts/models/rl/reinforce_model.py + + """ + + def __init__( + self, + env: str, + gamma: float = 0.99, + lam: float = 0.95, + lr_actor: float = 3e-4, + lr_critic: float = 1e-3, + max_episode_len: float = 200, + batch_size: int = 512, + steps_per_epoch: int = 2048, + nb_optim_iters: int = 4, + clip_ratio: float = 0.2, + ) -> None: + + """ + Args: + env: gym environment tag + gamma: discount factor + lam: advantage discount factor (lambda in the paper) + lr_actor: learning rate of actor network + lr_critic: learning rate of critic network + max_episode_len: maximum number interactions (actions) in an episode + batch_size: batch_size when training network- can simulate number of policy updates performed per epoch + steps_per_epoch: how many action-state pairs to rollout for trajectory collection per epoch + nb_optim_iters: how many steps of gradient descent to perform on each batch + clip_ratio: hyperparameter for clipping in the policy objective + """ + super().__init__() + + if not _GYM_AVAILABLE: + raise ModuleNotFoundError( + "This Module requires gym environment which is not installed yet." 
+ ) + + # Hyperparameters + self.lr_actor = lr_actor + self.lr_critic = lr_critic + self.steps_per_epoch = steps_per_epoch + self.nb_optim_iters = nb_optim_iters + self.batch_size = batch_size + self.gamma = gamma + self.lam = lam + self.max_episode_len = max_episode_len + self.clip_ratio = clip_ratio + self.save_hyperparameters() + + self.env = gym.make(env) + # value network + self.critic = create_mlp(self.env.observation_space.shape, 1) + # policy network (agent) + if type(self.env.action_space) == gym.spaces.box.Box: + act_dim = self.env.action_space.shape[0] + actor_mlp = create_mlp(self.env.observation_space.shape, act_dim) + self.actor = ActorContinous(actor_mlp, act_dim) + elif type(self.env.action_space) == gym.spaces.discrete.Discrete: + actor_mlp = create_mlp( + self.env.observation_space.shape, self.env.action_space.n + ) + self.actor = ActorCategorical(actor_mlp) + else: + raise NotImplementedError( + "Env action space should be of type Box (continous) or Discrete (categorical)" + "Got type: ", + type(self.env.action_space), + ) + self.agent = ActorCriticAgent(self.actor, self.critic) + + self.batch_states = [] + self.batch_actions = [] + self.batch_adv = [] + self.batch_qvals = [] + self.batch_logp = [] + + self.ep_rewards = [] + self.ep_values = [] + + self.done_episodes = 0 + self.epoch_rewards = 0 + self.avg_ep_reward = 0 + self.avg_ep_len = 0 + self.avg_reward = 0 + + self.state = torch.FloatTensor(self.env.reset()) + + def forward( + self, x: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Passes in a state x through the network and returns the policy and a sampled action + Args: + x: environment state + Returns: + Tuple of policy and action + """ + pi, action = self.actor(x) + value = self.critic(x) + + return pi, action, value + + def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: + """Calculate the discounted rewards of all rewards in list + Args: + rewards: list of rewards/advantages 
+ Returns: + list of discounted rewards/advantages + """ + assert isinstance(rewards[0], float) + + cumul_reward = [] + sum_r = 0.0 + + for r in reversed(rewards): + sum_r = (sum_r * discount) + r + cumul_reward.append(sum_r) + + return list(reversed(cumul_reward)) + + def calc_advantage( + self, rewards: List[float], values: List[float], last_value: float + ) -> List[float]: + """Calculate the advantage given rewards, state values, and the last value of episode + Args: + rewards: list of episode rewards + values: list of state values from critic + last_value: value of last state of episode + Returns: + list of advantages + """ + rews = rewards + [last_value] + vals = values + [last_value] + # GAE + delta = [ + rews[i] + self.gamma * vals[i + 1] - vals[i] for i in range(len(rews) - 1) + ] + adv = self.discount_rewards(delta, self.gamma * self.lam) + + return adv + + def train_batch( + self, + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + """ + Contains the logic for generating trajectory data to train policy and value network + Yield: + Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage + """ + + for step in range(self.steps_per_epoch): + pi, action, log_prob, value = self.agent(self.state, self.device) + next_state, reward, done, _ = self.env.step(action.cpu().numpy()) + + self.batch_states.append(self.state) + self.batch_actions.append(action) + self.batch_logp.append(log_prob) + + self.ep_rewards.append(reward) + self.ep_values.append(value.item()) + + self.state = torch.FloatTensor(next_state) + + epoch_end = step == (self.steps_per_epoch - 1) + terminal = len(self.ep_rewards) == self.max_episode_len + + if epoch_end or done or terminal: + # if trajectory ends abtruptly, boostrap value of next state + if (terminal or epoch_end) and not done: + with torch.no_grad(): + _, _, _, value = self.agent(self.state, self.device) + last_value = value.item() + else: + last_value = 0 + + # discounted cumulative 
reward + self.batch_qvals += self.discount_rewards( + self.ep_rewards + [last_value], self.gamma + )[:-1] + # advantage + self.batch_adv += self.calc_advantage( + self.ep_rewards, self.ep_values, last_value + ) + # logs + self.done_episodes += 1 + self.epoch_rewards += np.sum(self.ep_rewards) + # reset params + self.ep_rewards = [] + self.ep_values = [] + self.state = torch.FloatTensor(self.env.reset()) + + if epoch_end: + train_data = zip( + self.batch_states, + self.batch_actions, + self.batch_logp, + self.batch_qvals, + self.batch_adv, + ) + + for state, action, logp_old, qval, adv in train_data: + yield state, action, logp_old, qval, adv + + self.batch_states.clear() + self.batch_actions.clear() + self.batch_adv.clear() + self.batch_logp.clear() + self.batch_qvals.clear() + + self.avg_ep_reward = self.epoch_rewards / self.done_episodes + self.avg_reward = self.epoch_rewards / self.steps_per_epoch + self.avg_ep_len = self.steps_per_epoch / self.done_episodes + + self.epoch_rewards = 0 + self.done_episodes = 0 + + def actor_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: + pi, _ = self.actor(state) + logp = self.actor.get_log_prob(pi, action) + ratio = torch.exp(logp - logp_old) + clip_adv = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv + loss_actor = -(torch.min(ratio * adv, clip_adv)).mean() + return loss_actor + + def critic_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: + value = self.critic(state) + loss_critic = (qval - value).pow(2).mean() + return loss_critic + + def training_step( + self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx, optimizer_idx + ): + """ + Carries out a single update to actor and critic network from a batch of replay buffer. 
+ + Args: + batch: batch of replay buffer/trajectory data + batch_idx: not used + optimizer_idx: idx that controls optimizing actor or critic network + Returns: + loss + """ + state, action, old_logp, qval, adv = batch + self.log( + "avg_ep_len", self.avg_ep_len, prog_bar=True, on_step=False, on_epoch=True + ) + self.log( + "avg_ep_reward", + self.avg_ep_reward, + prog_bar=True, + on_step=False, + on_epoch=True, + ) + self.log( + "avg_reward", self.avg_reward, prog_bar=True, on_step=False, on_epoch=True + ) + + if optimizer_idx % 2 == 0: + loss_actor = self.actor_loss(state, action, old_logp, qval, adv) + self.log( + "loss_actor", + loss_actor, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + + return loss_actor + + else: + loss_critic = self.critic_loss(state, action, old_logp, qval, adv) + self.log( + "loss_critic", + loss_critic, + on_step=False, + on_epoch=True, + prog_bar=False, + logger=True, + ) + + return loss_critic + + def configure_optimizers(self) -> List[Optimizer]: + """ Initialize Adam optimizer""" + optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.lr_actor) + optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.lr_critic) + + # to run multple steps of gradient descent + optimizers = [] + for i in range(self.nb_optim_iters): + optimizers.append(optimizer_actor) + optimizers.append(optimizer_critic) + + return optimizers + + def _dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.train_batch) + dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) + return dataloader + + def train_dataloader(self) -> DataLoader: + """Get train loader""" + return self._dataloader() diff --git a/src/dagobert/modelling/rl/rl.py b/src/dagobert/modelling/rl/rl.py index ca60f407..35f465d9 100644 --- a/src/dagobert/modelling/rl/rl.py +++ b/src/dagobert/modelling/rl/rl.py @@ -1,5 +1,6 @@ -from dagobert.modelling.rl 
import RLEnv +from dagobert.modelling.rl import RLEnv, RLData def run_rl(args): - env = RLEnv(args) + rld = RLData(args) + # env = RLEnv(args) diff --git a/src/dagobert/naming.py b/src/dagobert/naming.py index f034ce41..829c0eca 100644 --- a/src/dagobert/naming.py +++ b/src/dagobert/naming.py @@ -199,6 +199,7 @@ class NPreprocessingArgs(object): """ anchor = "anchor" + target_col = "target_col" cols_to_model = "cols_to_model" close_original = "close_original" @@ -275,3 +276,11 @@ class NStockstats(object): wr_40 = "wr_40" vr_120 = "vr_120" vr_40 = "vr_40" + + +class NRL(object): + """ + Naming object for reinforcement learning environment / agent / algos. + """ + + rl_return = "rl_return" From 78b865055b24a9ea37b29b65989b28fc810e276b Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Wed, 30 Dec 2020 18:45:26 +0000 Subject: [PATCH 05/62] going through environment.py and trying to make it work --- config/rl_config.yaml | 72 ++------ src/dagobert/modelling/dl/tcn.py | 23 ++- src/dagobert/modelling/rl/environment.py | 213 +++++++++-------------- src/dagobert/modelling/rl/utils.py | 21 +++ 4 files changed, 130 insertions(+), 199 deletions(-) create mode 100644 src/dagobert/modelling/rl/utils.py diff --git a/config/rl_config.yaml b/config/rl_config.yaml index da8f251d..03a64656 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -19,19 +19,28 @@ auto_scale_batch_size: log_dir: logs num_workers: 4 -exp_name: TCN +exp_name: RL-PPO-TCN tags: - - model1 - - ethusdt_volume500 + - RL_test no_comet_logger: True seed: 42 -batch_size: 100 +batch_size: 128 + + +# -------------------------------------------------------------------------------------- +# RL +# -------------------------------------------------------------------------------------- + +episode_length: 1000 +target_col: rl_return +asset_names: + - BTC + - ETH # -------------------------------------------------------------------------------------- # MODEL # 
-------------------------------------------------------------------------------------- -output_size: 2 num_channels: [150, 150, 150, 150, 150, 150, 150] kernel_size: 10 dropout: 0.5 @@ -41,8 +50,6 @@ non_last_y_frac: 0.5 regression: False density_num: 3 mix_density_net: False -no_class_weights: True -no_sample_weights: True # -------------------------------------------------------------------------------------- # DATA @@ -53,13 +60,6 @@ data_dir: "C:/Work/dagobert/data/modelling" lookback: auto mini_series_length: auto -# If this is set to a number, then simple lookahead labelling is in place -simple_lookahead_y: -simple_lookahead_reg: False - -# If this is True, anchor is labelled before preprocessing. to_label and simple_lookahead_y cannot be used together. -to_label: False - df_train: anchor: std_bar_BTCUSDT_tick_1.feather df2: std_bar_ETHUSDT_tick_1.feather @@ -110,49 +110,7 @@ cols_to_model: - mdi_60 - vr_60 df2: - - date_diff - - open - - high - - low - - close - - open_fd_0.0 - - high_fd_0.0 - - low_fd_0.0 - - close_fd_0.0 - - open_fd_tuned - - high_fd_tuned - - low_fd_tuned - - close_fd_tuned - - cum_ticks - - cum_dollar - - volume - - cum_volume_buy - - cum_volume_sell - - cum_volume_quote - - cum_volume_quote_buy - - cum_volume_quote_sell - - sin_date - - cos_date - - sin_time - - cos_time - - boll - - boll_lb - - boll_ub - - macd - - macds - - macdh - - wr_60 - - rsi_60 - - rsv_60 - - atr_60 - - cci_60 - - kdjk_60 - - kdjd_60 - - kdjj_60 - - pdi_60 - - mdi_60 - - vr_60 -target_col: close_fd_0.0 + time_feat_n: 1 time_embed_dim: 12 diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index 0fefa51d..e49304fc 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -626,7 +626,7 @@ def _log_graph(self, datasets: CryptoDataset): # ---------------------------------------------------------------------------------- @staticmethod - def _pre_sanity_check(hparams: Namespace): + def 
_pre_sanity_check(hparams: Namespace) -> Namespace: """Certain sanity checks must happen before preprocessing takes place.""" # TARGET VARIABLE if ( @@ -654,7 +654,19 @@ def _pre_sanity_check(hparams: Namespace): "Classification is not applicable with mixed density nets" ) - # NET + # MINI SERIES / LOOKBACK + hparams = TCNLightning._pre_sanity_check_mini_series_lookback(hparams) + + # ETC + if hparams.augment_dfs and npa.anchor not in hparams.augment_dfs.keys(): + raise ValueError( + "If you use augment_dfs you must at least define the anchor key." + ) + return hparams + + @staticmethod + def _pre_sanity_check_mini_series_lookback(hparams: Namespace) -> Namespace: + """Calculate lookback and mini_series_length if necessary.""" net_depth = len(hparams.num_channels) k_size = hparams.kernel_size max_seq_len = TemporalConvNet.get_tcn_receptive_field_size(k_size, net_depth) @@ -681,13 +693,6 @@ def _pre_sanity_check(hparams: Namespace): f"corresponds to an estimated lookback of {hparams.lookback} hours." ) - # ETC - if hparams.augment_dfs and npa.anchor not in hparams.augment_dfs.keys(): - raise ValueError( - "If you use augment_dfs you must at least define the anchor key." - ) - return hparams - def _sanity_check(self): """ Make sure the options defined in hparams don't contradict each other. 
diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index a4bd7b72..ba900123 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -12,13 +12,9 @@ from torch.utils.data import Dataset, DataLoader from dagobert.naming import NRL -from dagobert.modelling.dl import ( - PortfolioCryptoDataset, - Preprocessing, - TemporalConvNet, - TCNLightning, -) -from dagobert.modelling.utils import update_lookback +from dagobert.naming import NPreprocessingArgs as npa +from dagobert.modelling.dl import PortfolioCryptoDataset, Preprocessing, TCNLightning +from dagobert.modelling.rl.utils import sharpe_ratio, max_drawdown logger = logging.getLogger(__name__) @@ -31,20 +27,23 @@ class RLData(object): module as much as possible. """ - def __init__(self, hparams: Namespace, train_val_test: str = "train"): + def __init__( + self, + hparams: Namespace, + train_val_test: str = "train", + ): """ Class constructor. Args: hparams: Hyparams parsed by the rl_runner. Similar to how `TCNLightning` is initialized. - train_val_test: Whether we are training, validating or testing, it must be either train, val or test. 
""" - hparams = RLData._pre_sanity_check(hparams) - hparams = Preprocessing().preprocess_augment_dfs(hparams) - self.hparams = Preprocessing().preprocess_train_dfs(hparams) + self.idx = 0 + self.hparams = hparams + if train_val_test == "train": augment_dfs = self.hparams.augment_dfs augment_method = self.hparams.augment_method @@ -62,51 +61,34 @@ def __init__(self, hparams: Namespace, train_val_test: str = "train"): augment_dfs=augment_dfs, augment_dfs_mix=self.hparams.augment_dfs_mix, ) - - self.dataloader = iter(DataLoader(self.dataset)) - - from IPython import embed - - embed() + self.dataset_len = len(self.dataset) + self.reset() def step(self): - Xs, ys = self.dataset[idx] - return Xs, y, False + Xs, ys = self.dataset[self.idx] + y1 = np.concatenate([[1.0], ys]) + episode_full = self.idx == self.hparams.episode_length - 1 + done = True if episode_full else False + self.idx += 1 + return Xs, y1, done def reset(self): - self.dataloader = iter(DataLoader(self.dataset)) + self.idx = np.random.randint(self.dataset_len - self.hparams.episode_length) @staticmethod def _pre_sanity_check(hparams: Namespace): - + # ensure we have the rl specific target column in the config if hparams.target_col != NRL.rl_return: raise ValueError("target_col has to be rl_return for RL tasks.") - net_depth = len(hparams.num_channels) - k_size = hparams.kernel_size - max_seq_len = TemporalConvNet.get_tcn_receptive_field_size(k_size, net_depth) - logger.info( - f"A TCN with kernel size: {k_size} and depth: {net_depth} has a receptive " - f"field (can read a maximum sequence length) of {max_seq_len}." 
- ) - if hparams.mini_series_length == "auto": - logger.info(f"We set mini_series_length from 'auto' to {max_seq_len}.") - hparams.mini_series_length = max_seq_len - if ( - hparams.mini_series_length != "auto" - and hparams.mini_series_length > max_seq_len - ): - logger.warning( - f"Provided mini-series length: {hparams.mini_series_length} is " - f"larger than the networks receptive field size: {max_seq_len}." - ) - # calcualte what the current TCN setup corresponds to in hourly lookback - df_anchor = TCNLightning._load_anchor(hparams) - hparams.lookback = update_lookback(df_anchor, hparams.mini_series_length) - logger.info( - f"The current mini_series_legnth {hparams.mini_series_length}, " - f"corresponds to an estimated lookback of {hparams.lookback} hours." - ) + # make sure we have the same cols for each instrument + # this helps to have an environment with a single tensor as state + if len(hparams.cols_to_model) > 1: + for df_name, cols in hparams.cols_to_model.items(): + hparams.cols_to_model[df_name] = hparams.cols_to_model[npa.anchor] + + # MINI SERIES / LOOKBACK + hparams = TCNLightning._pre_sanity_check_mini_series_lookback(hparams) return hparams @@ -128,7 +110,7 @@ def __init__( self, asset_names: List[str], episode_length: int = 1000, - trading_cost: float = 0.001, + trading_cost: float = 0.002, reward_type: str = "return", ): """ @@ -136,7 +118,8 @@ def __init__( Args: asset_names: Names of assets in the portfolio. - trading_cost: Commission rate, currently set to Binance's VIP0 taker level. + trading_cost: Commission rate, currently set to Binance's VIP0 taker level + plus doubled it to account for slippage. TODO: model slippage. reward_type: Whether to use the log return as reward or the sharpe ratio, which was found to be more stable. 
""" @@ -146,7 +129,7 @@ def __init__( self.reward_type = reward_type self.reset() - def _step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: + def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: """ See Figure 1 in https://arxiv.org/abs/1706.10059 to understand what this one step corresponds to, also the equation numbers correspond to the paper's. @@ -226,105 +209,69 @@ class RLEnv(gym.Env): def __init__( self, - history, - abbreviation, - trading_cost=0.0025, - time_cost=0.00, - window_length=50, - start_idx=0, - sample_start_date=None, + hparams: Namespace, + asset_names: List[str], + train_val_test: str = "train", + episode_length: int = 1000, + trading_cost: float = 0.001, + reward_type: str = "return", ): """ An environment for financial portfolio management. - Params: - steps - steps in episode - scale - scale data and each episode (except return) - augment - fraction to randomly shift data by - trading_cost - cost of trade as a fraction - time_cost - cost of holding as a fraction - window_length - how many past observations to return - start_idx - The number of days from '2012-08-13' of the dataset - sample_start_date - The start date sampling from the history + + Args: + hparams: + asset_names: + + train_val_test: + episode_length: + trading_cost: + reward_type: """ - self.window_length = window_length - self.num_stocks = history.shape[0] - self.start_idx = start_idx - - self.src = RLData( - history, - abbreviation, - steps=steps, - window_length=window_length, - start_idx=start_idx, - start_date=sample_start_date, - ) + # prepare datafiles if necessary + hparams = RLData._pre_sanity_check(hparams) + hparams = Preprocessing().preprocess_augment_dfs(hparams) + self.hparams = Preprocessing().preprocess_train_dfs(hparams) + self.asset_names = self.hparams.asset_names + self.asset_n = len(self.asset_names) + self.feat_n = len(self.hparams.cols_to_model[npa.anchor]) - self.sim = RLPortfolio( - asset_names=abbreviation, - 
trading_cost=trading_cost, - time_cost=time_cost, - steps=steps, - ) + self.data = RLData(self.hparams, train_val_test="train") + self.portfolio = RLPortfolio(self.asset_names, self.hparams.episode_length) - # openai gym attributes - # action will be the portfolio weights from 0 to 1 for each asset + # include cash in the portfolio action space self.action_space = gym.spaces.Box( - 0, 1, shape=(len(self.src.asset_names) + 1,), dtype=np.float32 - ) # include cash + 0.0, 1.0, shape=(self.asset_n + 1), dtype=np.float32 + ) # get the observation space from the data min and max - self.observation_space = gym.spaces.Box( - low=-np.inf, - high=np.inf, - shape=(len(abbreviation), window_length, history.shape[-1]), - dtype=np.float32, + self.observation_space = gym.spaces.Dict( + { + "state": gym.spaces.Box( + low=-10, + high=10, + shape=(self.asset_n, self.feat_n, self.hparams.mini_series_length), + dtype=np.float32, + ), + "weights": self.action_space, + } ) - def step(self, action): - return self._step(action) - - def _step(self, action): - """ - Step the env. - Actions should be portfolio [w0...] - - Where wn is a portfolio weight from 0 to 1. The first is cash_bias - - cn is the portfolio conversion weights see PortioSim._step for description + def step(self, action: np.array): """ - np.testing.assert_almost_equal(action.shape, (len(self.sim.asset_names) + 1,)) + Step in the environment. - # normalise just in case + Args: + action: Portfolio weights for the N assets and the cash (first item). + They should all be between 0 and 1 (no shorting) and sum to 1. + """ + # cut and normalise action (just in case) action = np.clip(action, 0, 1) - - weights = action # np.array([cash_bias] + list(action)) # [w0, w1...] + weights = action weights /= weights.sum() + eps - weights[0] += np.clip( - 1 - weights.sum(), 0, 1 - ) # so if weights are all zeros we normalise to [1,0...] - - assert ((action >= 0) * (action <= 1)).all(), ( - "all action values should be between 0 and 1. 
Not %s" % action - ) - np.testing.assert_almost_equal( - np.sum(weights), - 1.0, - 3, - err_msg='weights should sum to 1. action="%s"' % weights, - ) - - observation, done1, ground_truth_obs = self.src._step() - - # concatenate observation with ones - cash_observation = np.ones((1, self.window_length, observation.shape[2])) - observation = np.concatenate((cash_observation, observation), axis=0) - - cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) - ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) - # relative price vector of last observation day (close/open) - close_price_vector = observation[:, -1, 3] - open_price_vector = observation[:, -1, 0] - y1 = close_price_vector / open_price_vector - reward, info, done2 = self.sim._step(weights, y1) + Xs, y1, done1 = self.data.step() + reward, info, done2 = self.portfolio.step(weights, y1) # calculate return for buy and hold a bit of each asset info["market_value"] = np.cumprod( diff --git a/src/dagobert/modelling/rl/utils.py b/src/dagobert/modelling/rl/utils.py new file mode 100644 index 00000000..f2651087 --- /dev/null +++ b/src/dagobert/modelling/rl/utils.py @@ -0,0 +1,21 @@ +"""Util functions for portfolio optimization and other RL related tasks""" + +import numpy as np + +eps = np.finfo(float).eps + + +def sharpe_ratio(returns, freq: int = 30, rfr: int = 0): + """ + Given a set of returns, calculates naive (rfr=0) sharpe (eq 28). + """ + return (np.sqrt(freq) * np.mean(returns - rfr + eps)) / np.std(returns - rfr + eps) + + +def max_drawdown(returns): + """ + Max drawdown. 
See https://www.investopedia.com/terms/m/maximum-drawdown-mdd.asp + """ + peak = returns.max() + trough = returns[returns.argmax() :].min() + return (trough - peak) / (peak + eps) From 6e466d16da76cabd26874a7a770fc266dc0ce06d Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Thu, 31 Dec 2020 14:23:01 +0000 Subject: [PATCH 06/62] working on environment --- src/dagobert/modelling/rl/environment.py | 62 +++++------------------- 1 file changed, 13 insertions(+), 49 deletions(-) diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index ba900123..4a198942 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -74,6 +74,7 @@ def step(self): def reset(self): self.idx = np.random.randint(self.dataset_len - self.hparams.episode_length) + return self.step() @staticmethod def _pre_sanity_check(hparams: Namespace): @@ -232,14 +233,16 @@ def __init__( hparams = RLData._pre_sanity_check(hparams) hparams = Preprocessing().preprocess_augment_dfs(hparams) self.hparams = Preprocessing().preprocess_train_dfs(hparams) + + # setup data and portfolio and other vars + self.infos = [] self.asset_names = self.hparams.asset_names self.asset_n = len(self.asset_names) self.feat_n = len(self.hparams.cols_to_model[npa.anchor]) - self.data = RLData(self.hparams, train_val_test="train") self.portfolio = RLPortfolio(self.asset_names, self.hparams.episode_length) - # include cash in the portfolio action space + # setup openai gym env - include cash in the portfolio action space self.action_space = gym.spaces.Box( 0.0, 1.0, shape=(self.asset_n + 1), dtype=np.float32 ) @@ -270,56 +273,17 @@ def step(self, action: np.array): weights = action weights /= weights.sum() + eps - Xs, y1, done1 = self.data.step() + next_state, y1, done1 = self.data.step() reward, info, done2 = self.portfolio.step(weights, y1) - - # calculate return for buy and hold a bit of each asset - info["market_value"] = np.cumprod( - 
[inf["return"] for inf in self.infos + [info]] - )[-1] - # add dates - info["date"] = self.start_idx + self.src.idx + self.src.step - info["steps"] = self.src.step - info["next_obs"] = ground_truth_obs - self.infos.append(info) - return observation, reward, done1 or done2, info + return next_state, reward, done1 or done2, info def reset(self): - return self._reset() - - def _reset(self): self.infos = [] - self.sim.reset() - observation, ground_truth_obs = self.src.reset() - cash_observation = np.ones((1, self.window_length, observation.shape[2])) - observation = np.concatenate((cash_observation, observation), axis=0) - cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) - ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) - info = {} - info["next_obs"] = ground_truth_obs - return observation, info - - def _render(self, mode="human", close=False): - if close: - return - if mode == "ansi": - print(self.infos[-1]) - elif mode == "human": - self.plot() - - def render(self, mode="human", close=False): - return self._render(mode="human", close=False) - - def plot(self): - # show a plot of portfolio vs mean market performance - df_info = pd.DataFrame(self.infos) - df_info["date"] = pd.to_datetime(df_info["date"], format="%Y-%m-%d") - df_info.set_index("date", inplace=True) - mdd = max_drawdown(df_info.rate_of_return + 1) - sharpe_ratio = sharpe(df_info.rate_of_return) - title = "max_drawdown={: 2.2%} sharpe_ratio={: 2.4f}".format(mdd, sharpe_ratio) - df_info[["portfolio_value", "market_value"]].plot( - title=title, fig=plt.gcf(), rot=30 - ) + self.portfolio.reset() + next_state, _, _ = self.data.reset() + return next_state + + def render(self): + pass From 28b11cb78dca51b9d33254ee54f2f0f00b15744b Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Fri, 1 Jan 2021 13:48:17 +0000 Subject: [PATCH 07/62] adding rl args and setting up the runner properly, time to go through the networks and the actual algo --- config/rl_config.yaml | 37 
++-- src/dagobert/modelling/dl/__init__.py | 2 +- src/dagobert/modelling/dl/preprocessing.py | 10 +- src/dagobert/modelling/dl/tcn.py | 11 +- src/dagobert/modelling/dl/tcn_args.py | 18 +- src/dagobert/modelling/rl/__init__.py | 2 + src/dagobert/modelling/rl/environment.py | 60 ++---- src/dagobert/modelling/rl/ppo.py | 238 ++++++++++++--------- src/dagobert/modelling/rl/rl.py | 6 - src/dagobert/modelling/rl/rl_args.py | 177 +++++++++++++++ src/dagobert/modelling/rl/rl_runner.py | 8 +- 11 files changed, 366 insertions(+), 203 deletions(-) delete mode 100644 src/dagobert/modelling/rl/rl.py create mode 100644 src/dagobert/modelling/rl/rl_args.py diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 03a64656..dae94a58 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -5,7 +5,6 @@ # -------------------------------------------------------------------------------------- gpus: 1 -multiprocessing: False pin_memory: True profiler: True val_check_interval: 0.5 @@ -31,25 +30,36 @@ batch_size: 128 # RL # -------------------------------------------------------------------------------------- -episode_length: 1000 -target_col: rl_return asset_names: - BTC - ETH +trading_cost: 0.002 +reward_type: return +max_episode_length: 1000 +steps_per_epoch: 2000 +n_optim_iters: 4 +gamma: 0.99 +lamb: 0.95 +lr_actor: 0.0003 +lr_critic: 0.001 +clip_ratio: 0.2 + +# don't change these, or preprocessing won't work +target_col: rl_return +to_label: False +no_sample_weights: True +binariser_method: # -------------------------------------------------------------------------------------- # MODEL # -------------------------------------------------------------------------------------- -num_channels: [150, 150, 150, 150, 150, 150, 150] -kernel_size: 10 -dropout: 0.5 -use_last_timepoint: True -last_y: False -non_last_y_frac: 0.5 -regression: False -density_num: 3 -mix_density_net: False +actor_num_channels: [50, 50, 50, 50, 50] +actor_kernel_size: 5 +actor_dropout: 0.25 
+critic_num_channels: [50, 50, 50, 50, 50] +critic_kernel_size: 5 +critic_dropout: 0.25 # -------------------------------------------------------------------------------------- # DATA @@ -110,6 +120,7 @@ cols_to_model: - mdi_60 - vr_60 df2: + # the cols of the secondary DFs will automatically be set to anchor's time_feat_n: 1 time_embed_dim: 12 @@ -132,6 +143,4 @@ test_days: 30 test_train_offset_days: 62 test_puffer_days: 1 -sample_weights: -binariser_method: scaling_method: minmax diff --git a/src/dagobert/modelling/dl/__init__.py b/src/dagobert/modelling/dl/__init__.py index 81ddc443..4dd9f930 100644 --- a/src/dagobert/modelling/dl/__init__.py +++ b/src/dagobert/modelling/dl/__init__.py @@ -1,4 +1,4 @@ -from .data import CryptoDataset, PortfolioCryptoDataset +from .data import CryptoDataset, PortfolioCryptoDataset, ExperienceSourceDataset from .tcn_net import TemporalConvNet from .utils import LogCoshLoss, FocalLoss, MixedNormalPDFLoss from .adabelief import AdaBelief diff --git a/src/dagobert/modelling/dl/preprocessing.py b/src/dagobert/modelling/dl/preprocessing.py index ffa6916a..ebce640c 100644 --- a/src/dagobert/modelling/dl/preprocessing.py +++ b/src/dagobert/modelling/dl/preprocessing.py @@ -85,10 +85,7 @@ def preprocess_augment_dfs(hparams: Namespace) -> Namespace: @staticmethod def _preprocess_augment_dfs( - hparams: Namespace, - df_name: str, - df_path: str, - df_path_prev: str, + hparams: Namespace, df_name: str, df_path: str, df_path_prev: str, ) -> Optional: """ Helper function that performs the preprocessing of simple augment DFs. @@ -553,10 +550,7 @@ def _quantile_filter( @staticmethod def _binarise( - df: pd.DataFrame, - method: str, - threshold: float, - df_name: str = "", + df: pd.DataFrame, method: str, threshold: float, df_name: str = "", ) -> pd.DataFrame: """ Binarises a DF with the provided method and threshold. 
diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index e49304fc..6aacde87 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from typing import Optional -from functools import partial from argparse import Namespace import numpy as np @@ -131,11 +130,7 @@ def setup_and_run_tcn_lightning(args: Namespace, study: bool = False): args, logger=tcn_loggers, checkpoint_callback=checkpoint_callback, - callbacks=[ - early_stop_callback, - metrics_callback, - LearningRateMonitor(), - ], + callbacks=[early_stop_callback, metrics_callback, LearningRateMonitor(),], ) model = TCNLightning(args) @@ -178,6 +173,7 @@ def __init__(self, hparams: Namespace): Args: hparams: Hyper-params passed in to the module. See the docs for more details https://pytorch-lightning.readthedocs.io/en/latest/hyperparameters.html + and dagobert.modelling.dl.tcn_args for more information on the params. """ # define main vars (other than model) @@ -212,8 +208,7 @@ def __init__(self, hparams: Namespace): if self.hparams.mix_density_net: self.linear_mu = nn.Linear(self.hparams.num_channels[-1], self.density_num) self.linear_sigmasq = nn.Linear( - self.hparams.num_channels[-1], - self.density_num, + self.hparams.num_channels[-1], self.density_num, ) self.linear_mix = nn.Linear(self.hparams.num_channels[-1], self.density_num) self = self.float() diff --git a/src/dagobert/modelling/dl/tcn_args.py b/src/dagobert/modelling/dl/tcn_args.py index d08b0a0b..cd15e7db 100644 --- a/src/dagobert/modelling/dl/tcn_args.py +++ b/src/dagobert/modelling/dl/tcn_args.py @@ -43,10 +43,7 @@ def add_run_specific_args(parent_parser): help="Number of cores to use to prepare the batches.", ) parser.add_argument( - "--exp_name", - type=str, - default="TCN", - help="Name of experiment.", + "--exp_name", type=str, default="TCN", help="Name of experiment.", ) parser.add_argument( "--tags", @@ -236,9 +233,7 @@ def 
add_data_specific_args(parent_parser): # this is just a place-holder so it's easier to read the million params in the cmd parser.add_argument("--DATA_PARAMS", help="====================================") parser.add_argument( - "--data_dir", - type=str, - help="Path to folder holding the data files to use.", + "--data_dir", type=str, help="Path to folder holding the data files to use.", ) parser.add_argument( "--lookback", type=float, default=6, help="Lookback length in hours." @@ -272,9 +267,7 @@ def add_data_specific_args(parent_parser): ), ) parser.add_argument( - "--to_label", - action="store_true", - help="Label datasets before preprocessing.", + "--to_label", action="store_true", help="Label datasets before preprocessing.", ) parser.add_argument( "--label_sl", type=int, default=1, help="Stop-loss barrier size." @@ -286,10 +279,7 @@ def add_data_specific_args(parent_parser): "--label_first_or_max", type=str, default=NBarriers.first, - choices=[ - NBarriers.first, - NBarriers.max, - ], + choices=[NBarriers.first, NBarriers.max,], help="Weather to use the first or maximum barrier-touch.", ) parser.add_argument( diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py index 56d3ada1..d848e772 100644 --- a/src/dagobert/modelling/rl/__init__.py +++ b/src/dagobert/modelling/rl/__init__.py @@ -1 +1,3 @@ from .environment import RLData, RLPortfolio, RLEnv +from .networks import create_mlp, ActorCriticAgent, ActorContinous +from .ppo import PPO, run_rl diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 4a198942..dd71b9c2 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -11,9 +11,8 @@ import matplotlib.pyplot as plt from torch.utils.data import Dataset, DataLoader -from dagobert.naming import NRL from dagobert.naming import NPreprocessingArgs as npa -from dagobert.modelling.dl import PortfolioCryptoDataset, Preprocessing, 
TCNLightning +from dagobert.modelling.dl import PortfolioCryptoDataset from dagobert.modelling.rl.utils import sharpe_ratio, max_drawdown @@ -28,9 +27,7 @@ class RLData(object): """ def __init__( - self, - hparams: Namespace, - train_val_test: str = "train", + self, hparams: Namespace, train_val_test: str = "train", ): """ Class constructor. @@ -67,31 +64,15 @@ def __init__( def step(self): Xs, ys = self.dataset[self.idx] y1 = np.concatenate([[1.0], ys]) - episode_full = self.idx == self.hparams.episode_length - 1 + episode_full = self.idx == self.hparams.max_episode_length - 1 done = True if episode_full else False self.idx += 1 return Xs, y1, done def reset(self): - self.idx = np.random.randint(self.dataset_len - self.hparams.episode_length) + self.idx = np.random.randint(self.dataset_len - self.hparams.max_episode_length) return self.step() - @staticmethod - def _pre_sanity_check(hparams: Namespace): - # ensure we have the rl specific target column in the config - if hparams.target_col != NRL.rl_return: - raise ValueError("target_col has to be rl_return for RL tasks.") - - # make sure we have the same cols for each instrument - # this helps to have an environment with a single tensor as state - if len(hparams.cols_to_model) > 1: - for df_name, cols in hparams.cols_to_model.items(): - hparams.cols_to_model[df_name] = hparams.cols_to_model[npa.anchor] - - # MINI SERIES / LOOKBACK - hparams = TCNLightning._pre_sanity_check_mini_series_lookback(hparams) - return hparams - class RLPortfolio(object): """ @@ -110,7 +91,7 @@ class RLPortfolio(object): def __init__( self, asset_names: List[str], - episode_length: int = 1000, + max_episode_length: int = 1000, trading_cost: float = 0.002, reward_type: str = "return", ): @@ -208,39 +189,22 @@ class RLEnv(gym.Env): https://github.com/wassname/rl-portfolio-management """ - def __init__( - self, - hparams: Namespace, - asset_names: List[str], - train_val_test: str = "train", - episode_length: int = 1000, - trading_cost: float 
= 0.001, - reward_type: str = "return", - ): + def __init__(self, hparams: Namespace): """ An environment for financial portfolio management. Args: hparams: - asset_names: - - train_val_test: - episode_length: - trading_cost: - reward_type: + """ - # prepare datafiles if necessary - hparams = RLData._pre_sanity_check(hparams) - hparams = Preprocessing().preprocess_augment_dfs(hparams) - self.hparams = Preprocessing().preprocess_train_dfs(hparams) - - # setup data and portfolio and other vars self.infos = [] - self.asset_names = self.hparams.asset_names - self.asset_n = len(self.asset_names) + self.hparams = hparams + self.asset_n = len(self.hparams.asset_names) self.feat_n = len(self.hparams.cols_to_model[npa.anchor]) self.data = RLData(self.hparams, train_val_test="train") - self.portfolio = RLPortfolio(self.asset_names, self.hparams.episode_length) + self.portfolio = RLPortfolio( + self.hparams.asset_names, self.hparams.max_episode_length + ) # setup openai gym env - include cash in the portfolio action space self.action_space = gym.spaces.Box( diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 30b1e957..46cbc377 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -1,108 +1,128 @@ +""" +Pytorch Lightning module of Proximal Policy Optimization RL algorithm, taken and +modified from https://github.com/sid-sundrani/ppo_lightning. 
+""" +# pylint: disable=no-member +import logging from typing import List, Tuple +from argparse import Namespace -import pytorch_lightning as pl -from networks import create_mlp, ActorCriticAgent, ActorCategorical, ActorContinous -from data import ExperienceSourceDataset - +import gym import torch -from torch.utils.data import DataLoader -import torch.optim as optim -from torch.optim.optimizer import Optimizer import numpy as np +import torch.optim as optim +from torch.utils.data import DataLoader +from pytorch_lightning import LightningModule +from pytorch_lightning import Trainer, Callback, loggers +from pytorch_lightning.metrics import functional as plm +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.trainer import seed_everything + + +from dagobert.naming import NRL, NPreprocessingArgs as npa +from dagobert.modelling.rl import ( + RLEnv, + create_mlp, + ActorCriticAgent, + ActorContinous, +) +from dagobert.modelling.dl import ( + ExperienceSourceDataset, + Preprocessing, + TCNLightning, +) + + +logger = logging.getLogger(__name__) + + +def run_rl(args): + # setup loggers + seed_everything(args.seed) + tb_logger_name = None + comet_name = args.exp_name + tcn_loggers = [] + tb_logger = loggers.TensorBoardLogger( + save_dir=Path(args.log_dir), name=args.exp_name, version=tb_logger_name + ) + tcn_loggers.append(tb_logger) + if not args.no_comet_logger: + tcn_loggers.append( + loggers.CometLogger( + api_key=NStudy.comet_api_key, + workspace=NStudy.comet_workspace, + save_dir=args.log_dir, + project_name=NStudy.comet_project_name, + experiment_name=f"{comet_name}_{tb_logger.version}", + ) + ) -try: - import gym -except ModuleNotFoundError: - _GYM_AVAILABLE = False -else: - _GYM_AVAILABLE = True - - -class PPO(pl.LightningModule): + # setup callbacks + checkpoint_callback = ModelCheckpoint( + monitor="loss/val", + filename="_{epoch:02d}_{loss_val:.10f}", + dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", + 
save_top_k=3, + mode="max", + ) + + # define trainer and and lightning module + args.multiprocessing = True if args.gpus != 1 else False + trainer = Trainer.from_argparse_args( + args, + logger=tcn_loggers, + checkpoint_callback=checkpoint_callback, + callbacks=[early_stop_callback, metrics_callback, LearningRateMonitor(),], + ) + model = PPO(args) + trainer.fit(model) + # trainer.test() + + # return the validation and test loss for Optuna mainly + try: + val_loss = metrics_callback.get_min_max_metric("loss/val") + test_loss = metrics_callback.get_min_max_metric("loss/test") + except: + val_loss, test_loss = np.nan, np.nan + return val_loss, test_loss + rld = RLData(args) + + +class PPO(LightningModule): """ - PyTorch Lightning implementation of `PPO - `_ - Paper authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov - - Example: - model = PPO("CartPole-v0") - Train: - trainer = Trainer() - trainer.fit(model) - Note: - This example is based on: - https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py - https://github.com/PyTorchLightning/pytorch-lightning-bolts/blob/master/pl_bolts/models/rl/reinforce_model.py - + PyTorch Lightning implementation of PPO https://arxiv.org/abs/1707.06347 + The algorithm closely follows this: + https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py """ - def __init__( - self, - env: str, - gamma: float = 0.99, - lam: float = 0.95, - lr_actor: float = 3e-4, - lr_critic: float = 1e-3, - max_episode_len: float = 200, - batch_size: int = 512, - steps_per_epoch: int = 2048, - nb_optim_iters: int = 4, - clip_ratio: float = 0.2, - ) -> None: - + def __init__(self, hparams: Namespace): """ + Class constructor. 
+ Args: - env: gym environment tag - gamma: discount factor - lam: advantage discount factor (lambda in the paper) - lr_actor: learning rate of actor network - lr_critic: learning rate of critic network - max_episode_len: maximum number interactions (actions) in an episode - batch_size: batch_size when training network- can simulate number of policy updates performed per epoch - steps_per_epoch: how many action-state pairs to rollout for trajectory collection per epoch - nb_optim_iters: how many steps of gradient descent to perform on each batch - clip_ratio: hyperparameter for clipping in the policy objective + hparams: Hyper-params passed in to the module. See the docs for more details + https://pytorch-lightning.readthedocs.io/en/latest/hyperparameters.html + and dagobert.modelling.rl.rl_args for more information on the params. """ super().__init__() - if not _GYM_AVAILABLE: - raise ModuleNotFoundError( - "This Module requires gym environment which is not installed yet." - ) + # sanity check and setup device + hparams = PPO._pre_sanity_check(hparams) + self.tcn_device = "cuda" if hparams.gpus > 0 else "cpu" - # Hyperparameters - self.lr_actor = lr_actor - self.lr_critic = lr_critic - self.steps_per_epoch = steps_per_epoch - self.nb_optim_iters = nb_optim_iters - self.batch_size = batch_size - self.gamma = gamma - self.lam = lam - self.max_episode_len = max_episode_len - self.clip_ratio = clip_ratio - self.save_hyperparameters() - - self.env = gym.make(env) - # value network + # prepare datafiles if necessary + hparams = Preprocessing().preprocess_augment_dfs(hparams) + self.hparams = Preprocessing().preprocess_train_dfs(hparams) + + # create env and policy/value networks + self.env = RLEnv(hparams) self.critic = create_mlp(self.env.observation_space.shape, 1) - # policy network (agent) - if type(self.env.action_space) == gym.spaces.box.Box: - act_dim = self.env.action_space.shape[0] - actor_mlp = create_mlp(self.env.observation_space.shape, act_dim) - self.actor 
= ActorContinous(actor_mlp, act_dim) - elif type(self.env.action_space) == gym.spaces.discrete.Discrete: - actor_mlp = create_mlp( - self.env.observation_space.shape, self.env.action_space.n - ) - self.actor = ActorCategorical(actor_mlp) - else: - raise NotImplementedError( - "Env action space should be of type Box (continous) or Discrete (categorical)" - "Got type: ", - type(self.env.action_space), - ) + act_dim = self.env.action_space.shape[0] + actor_mlp = create_mlp(self.env.observation_space.shape, act_dim) + self.actor = ActorContinous(actor_mlp, act_dim) self.agent = ActorCriticAgent(self.actor, self.critic) + # init self.batch_states = [] self.batch_actions = [] self.batch_adv = [] @@ -168,10 +188,10 @@ def calc_advantage( vals = values + [last_value] # GAE delta = [ - rews[i] + self.gamma * vals[i + 1] - vals[i] for i in range(len(rews) - 1) + rews[i] + self.hparams.gamma * vals[i + 1] - vals[i] + for i in range(len(rews) - 1) ] - adv = self.discount_rewards(delta, self.gamma * self.lam) - + adv = self.discount_rewards(delta, self.hparams.gamma * self.hparams.lam) return adv def train_batch( @@ -183,7 +203,7 @@ def train_batch( Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage """ - for step in range(self.steps_per_epoch): + for step in range(self.hparams.steps_per_epoch): pi, action, log_prob, value = self.agent(self.state, self.device) next_state, reward, done, _ = self.env.step(action.cpu().numpy()) @@ -196,8 +216,8 @@ def train_batch( self.state = torch.FloatTensor(next_state) - epoch_end = step == (self.steps_per_epoch - 1) - terminal = len(self.ep_rewards) == self.max_episode_len + epoch_end = step == (self.hparams.steps_per_epoch - 1) + terminal = len(self.ep_rewards) == self.hparams.max_episode_length if epoch_end or done or terminal: # if trajectory ends abtruptly, boostrap value of next state @@ -210,7 +230,7 @@ def train_batch( # discounted cumulative reward self.batch_qvals += self.discount_rewards( - 
self.ep_rewards + [last_value], self.gamma + self.ep_rewards + [last_value], self.hparams.gamma )[:-1] # advantage self.batch_adv += self.calc_advantage( @@ -316,14 +336,16 @@ def training_step( return loss_critic - def configure_optimizers(self) -> List[Optimizer]: + def configure_optimizers(self) -> List[optim.Optimizer]: """ Initialize Adam optimizer""" - optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.lr_actor) - optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.lr_critic) + optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.hparams.lr_actor) + optimizer_critic = optim.Adam( + self.critic.parameters(), lr=self.hparams.lr_critic + ) # to run multple steps of gradient descent optimizers = [] - for i in range(self.nb_optim_iters): + for i in range(self.hparams.n_optim_iters): optimizers.append(optimizer_actor) optimizers.append(optimizer_critic) @@ -332,9 +354,25 @@ def configure_optimizers(self) -> List[Optimizer]: def _dataloader(self) -> DataLoader: """Initialize the Replay Buffer dataset used for retrieving experiences""" dataset = ExperienceSourceDataset(self.train_batch) - dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) + dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size) return dataloader def train_dataloader(self) -> DataLoader: """Get train loader""" return self._dataloader() + + @staticmethod + def _pre_sanity_check(hparams: Namespace): + # ensure we have the rl specific target column in the config + if hparams.target_col != NRL.rl_return: + raise ValueError("target_col has to be rl_return for RL tasks.") + + # make sure we have the same cols for each instrument + # this helps to have an environment with a single tensor as state + if len(hparams.cols_to_model) > 1: + for df_name, cols in hparams.cols_to_model.items(): + hparams.cols_to_model[df_name] = hparams.cols_to_model[npa.anchor] + + # MINI SERIES / LOOKBACK + hparams = 
TCNLightning._pre_sanity_check_mini_series_lookback(hparams) + return hparams diff --git a/src/dagobert/modelling/rl/rl.py b/src/dagobert/modelling/rl/rl.py deleted file mode 100644 index 35f465d9..00000000 --- a/src/dagobert/modelling/rl/rl.py +++ /dev/null @@ -1,6 +0,0 @@ -from dagobert.modelling.rl import RLEnv, RLData - - -def run_rl(args): - rld = RLData(args) - # env = RLEnv(args) diff --git a/src/dagobert/modelling/rl/rl_args.py b/src/dagobert/modelling/rl/rl_args.py new file mode 100644 index 00000000..01d2f9dd --- /dev/null +++ b/src/dagobert/modelling/rl/rl_args.py @@ -0,0 +1,177 @@ +""" +All custom arguments and hyper-parameters for the reinforcement learning module. +""" + +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + +from pytorch_lightning import Trainer + +from dagobert.modelling.dl.tcn import TCNLightning +from dagobert.modelling.dl.tcn_args import ( + add_run_specific_args, + add_data_specific_args, + add_preprocessing_specific_args, +) +from dagobert.naming import ( + NInputDataCols, + NAugmentationMethods, + NBarriers, + NPreprocessingArgs, +) + + +def add_rl_specific_args(parent_parser): + parser = ArgumentParser( + parents=[parent_parser], + add_help=False, + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # this is just a place-holder so it's easier to read the million params in the cmd + parser.add_argument("--RL_PARAMS", help="====================================") + parser.add_argument( + "--asset_names", + type=str, + nargs="+", + default=["BTC", "ETH"], + help=( + "Names of instruments to include in the portfolio, corresponding to " + "anchor, df2, df3, etc." + ), + ) + parser.add_argument( + "--trading_cost", + type=float, + default=0.002, + help="Commission rate of making trades + an estimated cost of slippage.", + ) + parser.add_argument( + "--reward_type", + type=str, + default="return", + help=( + "Determines the overall reward to maximise by the agent. Either return or " + "sharpe. 
See RLPortfolio class for more details." + ), + ) + parser.add_argument( + "--max_episode_length", + type=int, + default=1000, + help=( + "Maximum number of interactions between the agent and the environment in " + "an episode." + ), + ) + parser.add_argument( + "--steps_per_epoch", + type=int, + default=10000, + help=( + "How many action-state pairs to rollout for trajectory collection per " + "epoch. I.e. if all episodes run to their max_episode_length, we'll have " + "steps_per_epoch/max_episode_length number of unique episodes/trajectories." + ), + ) + parser.add_argument( + "--n_optim_iters", + type=int, + default=4, + help=( + "How many steps of gradient descent to perform on each batch. This might " + "seem weird, but it helps sampling efficiency, done by the original PPO " + "implementation and the Google ablation study found it to be useful." + ), + ) + parser.add_argument( + "--gamma", type=float, default=0.99, help="Discounting of rewards." + ) + parser.add_argument( + "--lam", + type=float, + default=0.95, + help="Lambda parameter in the advantage discounting equation.", + ) + parser.add_argument( + "--lr_actor", + type=float, + default=0.0003, + help="Learning rate for the actor/policy network.", + ) + parser.add_argument( + "--lr_critic", + type=float, + default=0.001, + help="Learning rate for the critic/value network.", + ) + parser.add_argument( + "--clip_ratio", + type=float, + default=0.2, + help="Clipping parameter for the PPO's policy upgrade cost function.", + ) + + return parser + + +def add_model_specific_args(parent_parser): + parser = ArgumentParser( + parents=[parent_parser], + add_help=False, + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # this is just a place-holder so it's easier to read the million params in the cmd + parser.add_argument("--MODEL_PARAMS", help="====================================") + parser.add_argument( + "--actor_num_channels", + type=int, + nargs="+", + default=[50, 50, 50, 50, 50], + help=( + 
"Determines the number of layers (depth) of the actor / policy network and " + "the hidden unit count in each layer." + ), + ) + parser.add_argument( + "--critic_num_channels", + type=int, + nargs="+", + default=[50, 50, 50, 50, 50], + help=( + "Determines the number of layers (depth) of the critic / value network and " + "the hidden unit count in each layer." + ), + ) + parser.add_argument("--actor_kernel_size", type=int, default=5, help=" ") + parser.add_argument("--critic_kernel_size", type=int, default=5, help=" ") + parser.add_argument("--actor_dropout", type=float, default=0, help=" ") + parser.add_argument("--critic_dropout", type=float, default=0, help=" ") + parser.add_argument( + "--no_class_weights", + action="store_true", + help=( + "Set this to True so we can leverage the Preprocessing pipeline written " + "for the supervised DL module." + ), + ) + + return parser + + +def get_all_args(): + parser = ArgumentParser( + description="Lightning RL module", + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # add model params of lightning trainer (this HAS to be first) + parser = Trainer.add_argparse_args(parser) + + # add model and run specific params + parser = add_rl_specific_args(parser) + parser = add_model_specific_args(parser) + parser = add_run_specific_args(parser) + parser = add_data_specific_args(parser) + parser = add_preprocessing_specific_args(parser) + return parser.parse_args() diff --git a/src/dagobert/modelling/rl/rl_runner.py b/src/dagobert/modelling/rl/rl_runner.py index 8a777213..d7d23845 100644 --- a/src/dagobert/modelling/rl/rl_runner.py +++ b/src/dagobert/modelling/rl/rl_runner.py @@ -3,15 +3,15 @@ This module is driven by the `dagobert-rl` command which can be parametrised by command line arguments, but it's much more convenient to use YAML configs for this, -see the `tcn_args.py` for more detail. +see the `tcn_args.py` and `rl_args.py` for more detail. 
""" import logging from pathlib import Path from dagobert.utils import setup_logging from dagobert.runner_utils import load_config, update_args -from dagobert.modelling.dl.tcn_args import get_all_args -from dagobert.modelling.rl.rl import run_rl +from dagobert.modelling.rl.rl_args import get_all_args +from dagobert.modelling.rl import run_rl logger = logging.getLogger(__name__) @@ -19,7 +19,7 @@ def run(): """ - Run a single TCN training or parallelized hyper parameter tuning study using optuna. + Initialise a reinforcement-learning environment and a PPO agent and train it. """ # parse arguments amd setup logging From d35e2cafe2b5983c04769d915cdeefff66127cc1 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Fri, 1 Jan 2021 13:49:33 +0000 Subject: [PATCH 08/62] adding rl args and setting up the runner properly, time to go through the networks and the actual algo --- .pre-commit-config.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25d9788a..9decdab1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,5 +2,4 @@ repos: - repo: https://github.com/ambv/black rev: 20.8b1 hooks: - - id: black - language_version: python3.7 \ No newline at end of file + - id: black \ No newline at end of file From d6f8fd3c0758788e5a7ad16c2c579667dc50c78f Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Fri, 1 Jan 2021 18:07:52 +0000 Subject: [PATCH 09/62] going through ppo and networks to make it work with TCNs --- .pre-commit-config.yaml | 3 +- config/rl_config.yaml | 1 + src/dagobert/modelling/rl/__init__.py | 2 +- src/dagobert/modelling/rl/environment.py | 9 +-- src/dagobert/modelling/rl/networks.py | 82 ++++++++++++------------ src/dagobert/modelling/rl/ppo.py | 62 +++++++++--------- src/dagobert/modelling/rl/rl_args.py | 11 +++- 7 files changed, 93 insertions(+), 77 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9decdab1..25d9788a 100644 --- 
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,4 +2,5 @@ repos: - repo: https://github.com/ambv/black rev: 20.8b1 hooks: - - id: black \ No newline at end of file + - id: black + language_version: python3.7 \ No newline at end of file diff --git a/config/rl_config.yaml b/config/rl_config.yaml index dae94a58..6671c217 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -60,6 +60,7 @@ actor_dropout: 0.25 critic_num_channels: [50, 50, 50, 50, 50] critic_kernel_size: 5 critic_dropout: 0.25 +use_last_timepoint: True # -------------------------------------------------------------------------------------- # DATA diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py index d848e772..4df05a47 100644 --- a/src/dagobert/modelling/rl/__init__.py +++ b/src/dagobert/modelling/rl/__init__.py @@ -1,3 +1,3 @@ from .environment import RLData, RLPortfolio, RLEnv -from .networks import create_mlp, ActorCriticAgent, ActorContinous +from .networks import build_tcn, ActorCriticAgent, ActorContinous from .ppo import PPO, run_rl diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index dd71b9c2..0405a6c7 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -27,7 +27,9 @@ class RLData(object): """ def __init__( - self, hparams: Namespace, train_val_test: str = "train", + self, + hparams: Namespace, + train_val_test: str = "train", ): """ Class constructor. @@ -38,7 +40,6 @@ def __init__( train_val_test: Whether we are training, validating or testing, it must be either train, val or test. 
""" - self.idx = 0 self.hparams = hparams if train_val_test == "train": @@ -59,7 +60,7 @@ def __init__( augment_dfs_mix=self.hparams.augment_dfs_mix, ) self.dataset_len = len(self.dataset) - self.reset() + self.idx = np.random.randint(self.dataset_len - self.hparams.max_episode_length) def step(self): Xs, ys = self.dataset[self.idx] @@ -195,7 +196,7 @@ def __init__(self, hparams: Namespace): Args: hparams: - + """ self.infos = [] self.hparams = hparams diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 4e2745b6..c45824cb 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -1,60 +1,60 @@ +# pylint: disable=no-member +from argparse import Namespace from typing import Union, Tuple import torch from torch import nn from torch.distributions import Categorical, Normal +from dagobert.modelling.dl import TemporalConvNet -def create_mlp(input_shape: Tuple[int], n_actions: int, hidden_sizes: list = [64, 64]): - """ - Simple Multi-Layer Perceptron network - """ - net_layers = [] - net_layers.append(nn.Linear(input_shape[0], hidden_sizes[0])) - net_layers.append(nn.ReLU()) - - for i in range(len(hidden_sizes) - 1): - net_layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i + 1])) - net_layers.append(nn.ReLU()) - net_layers.append(nn.Linear(hidden_sizes[-1], n_actions)) - - return nn.Sequential(*net_layers) - -class ActorCategorical(nn.Module): +class ActorCriticTCN(nn.Module): """ - Policy network, for discrete action spaces, which returns a distribution - and an action given an observation + Creates either the actor/policy or the critic/value network as a TCN net, followed + by the appropriate linear layers. 
""" - def __init__(self, actor_net): + def __init__( + self, hparams: Namespace, n_actions: int, actor: bool = True + ) -> nn.Module: """ - Args: - input_shape: observation shape of the environment - n_actions: number of discrete actions available in the environment - """ - super().__init__() - - self.actor_net = actor_net + Init a TCN like we do in `dagobert.modelling.dl.tcn`. - def forward(self, states): - logits = self.actor_net(states) - pi = Categorical(logits=logits) - actions = pi.sample() - - return pi, actions - - def get_log_prob(self, pi: Categorical, actions: torch.Tensor): - """ - Takes in a distribution and actions and returns log prob of actions - under the distribution Args: - pi: torch distribution - actions: actions taken by distribution + hparams: Hparam parsed and updated by PPO module in dagobert.modelling.rl. + n_actions: Number of units at the end of the network: different for actor/critic + actor: If True, we are using the network params in hparams for the actor net, + else we take the params for the critic. + Returns: - log probability of the acition under pi + Initiated TCN with the appropriate size for actor or critic. 
""" - return pi.log_prob(actions) + self.hparams = hparams + num_inputs = [len(cols) for dataset, cols in hparams.cols_to_model.items()] + num_channels = ( + hparams.actor_num_channels if actor else hparams.critic_num_channels + ) + kernel_size = hparams.actor_kernel_size if actor else hparams.critic_kernel_size + dropout = hparams.actor_dropout if actor else hparams.critic_dropout + self.tcn = TemporalConvNet( + num_inputs=num_inputs, + num_channels=num_channels, + kernel_size=kernel_size, + dropout=dropout, + time_feat_n=hparams.time_feat_n, + time_embed_dim=hparams.time_embed_dim, + ) + self.linear1 = nn.Linear(hparams.mini_series_length, 1) + self.linear2 = nn.Linear(num_channels[-1], hparams.n_actions) + + def forward(self, *x): + y1 = self.tcn(*x) + if self.hparams.use_last_timepoint: + return self.linear2(y1[:, :, -1]) + else: + y2 = nn.functional.relu(self.linear1(y1).squeeze(-1)) + return self.linear2(y2) class ActorContinous(nn.Module): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 46cbc377..dc51ccd9 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -4,6 +4,7 @@ """ # pylint: disable=no-member import logging +from pathlib import Path from typing import List, Tuple from argparse import Namespace @@ -19,12 +20,12 @@ from pytorch_lightning.trainer import seed_everything -from dagobert.naming import NRL, NPreprocessingArgs as npa +from dagobert.naming import NRL, NStudy, NPreprocessingArgs as npa from dagobert.modelling.rl import ( RLEnv, - create_mlp, - ActorCriticAgent, + build_tcn, ActorContinous, + ActorCriticAgent, ) from dagobert.modelling.dl import ( ExperienceSourceDataset, @@ -72,21 +73,11 @@ def run_rl(args): args, logger=tcn_loggers, checkpoint_callback=checkpoint_callback, - callbacks=[early_stop_callback, metrics_callback, LearningRateMonitor(),], ) model = PPO(args) trainer.fit(model) # trainer.test() - # return the validation and test loss for Optuna mainly - try: - 
val_loss = metrics_callback.get_min_max_metric("loss/val") - test_loss = metrics_callback.get_min_max_metric("loss/test") - except: - val_loss, test_loss = np.nan, np.nan - return val_loss, test_loss - rld = RLData(args) - class PPO(LightningModule): """ @@ -116,10 +107,9 @@ def __init__(self, hparams: Namespace): # create env and policy/value networks self.env = RLEnv(hparams) - self.critic = create_mlp(self.env.observation_space.shape, 1) + self.critic = build_tcn(hparams, 1, actor=False) act_dim = self.env.action_space.shape[0] - actor_mlp = create_mlp(self.env.observation_space.shape, act_dim) - self.actor = ActorContinous(actor_mlp, act_dim) + self.actor = ActorContinous(build_tcn(hparams, act_dim), act_dim) self.agent = ActorCriticAgent(self.actor, self.critic) # init @@ -144,9 +134,11 @@ def forward( self, x: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Passes in a state x through the network and returns the policy and a sampled action + Passes state x through the network and returns the policy and a sampled action. + Args: x: environment state + Returns: Tuple of policy and action """ @@ -156,9 +148,12 @@ def forward( return pi, action, value def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: - """Calculate the discounted rewards of all rewards in list + """ + Calculate the discounted rewards of all rewards in list. + Args: rewards: list of rewards/advantages + Returns: list of discounted rewards/advantages """ @@ -176,11 +171,14 @@ def discount_rewards(self, rewards: List[float], discount: float) -> List[float] def calc_advantage( self, rewards: List[float], values: List[float], last_value: float ) -> List[float]: - """Calculate the advantage given rewards, state values, and the last value of episode + """ + Calculate the advantage given rewards, state values, and last value of episode. 
+ Args: rewards: list of episode rewards values: list of state values from critic last_value: value of last state of episode + Returns: list of advantages """ @@ -196,16 +194,24 @@ def calc_advantage( def train_batch( self, - ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + ) -> Tuple[ + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + ]: """ - Contains the logic for generating trajectory data to train policy and value network + Logic for generating trajectory data to train policy and value network + Yield: - Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage + Tuple of Lists containing tensors for states, actions, log probs, qvals and + advantage. """ for step in range(self.hparams.steps_per_epoch): pi, action, log_prob, value = self.agent(self.state, self.device) - next_state, reward, done, _ = self.env.step(action.cpu().numpy()) + next_state, reward, done, info = self.env.step(action.cpu().numpy()) self.batch_states.append(self.state) self.batch_actions.append(action) @@ -286,7 +292,8 @@ def training_step( self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx, optimizer_idx ): """ - Carries out a single update to actor and critic network from a batch of replay buffer. + Carries out a n_optim_iter number of updates to actor and critic network from a + batch of replay buffer. 
Args: batch: batch of replay buffer/trajectory data @@ -320,7 +327,6 @@ def training_step( prog_bar=True, logger=True, ) - return loss_actor else: @@ -333,7 +339,6 @@ def training_step( prog_bar=False, logger=True, ) - return loss_critic def configure_optimizers(self) -> List[optim.Optimizer]: @@ -343,12 +348,11 @@ def configure_optimizers(self) -> List[optim.Optimizer]: self.critic.parameters(), lr=self.hparams.lr_critic ) - # to run multple steps of gradient descent + # workaround to run multple steps of gradient descent within LightningModule optimizers = [] - for i in range(self.hparams.n_optim_iters): + for _ in range(self.hparams.n_optim_iters): optimizers.append(optimizer_actor) optimizers.append(optimizer_critic) - return optimizers def _dataloader(self) -> DataLoader: diff --git a/src/dagobert/modelling/rl/rl_args.py b/src/dagobert/modelling/rl/rl_args.py index 01d2f9dd..32853ad6 100644 --- a/src/dagobert/modelling/rl/rl_args.py +++ b/src/dagobert/modelling/rl/rl_args.py @@ -155,7 +155,16 @@ def add_model_specific_args(parent_parser): "for the supervised DL module." ), ) - + parser.add_argument( + "--use_last_timepoint", + action="store_true", + help=( + "If this flag is used the only the network's representation " + "corresponding at the latest time-point is used to predict the outcome." + "By default, we combine all representations across the sequence length" + "to make a prediction from, instead of just using the last one." 
+ ), + ) return parser From 67f84a33c66719e1934af1fdcf823749632c3fd1 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Fri, 1 Jan 2021 19:20:40 +0000 Subject: [PATCH 10/62] trying to get the tensor dims right to pass data through the TCN - not working yet --- config/rl_config.yaml | 2 +- src/dagobert/modelling/dl/tcn.py | 39 ++++++++++++++------ src/dagobert/modelling/rl/__init__.py | 4 +-- src/dagobert/modelling/rl/environment.py | 6 ++-- src/dagobert/modelling/rl/networks.py | 7 ++-- src/dagobert/modelling/rl/ppo.py | 45 +++++++++++++----------- src/dagobert/modelling/rl/rl_runner.py | 2 +- 7 files changed, 64 insertions(+), 41 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 6671c217..a651a58a 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -7,7 +7,7 @@ gpus: 1 pin_memory: True profiler: True -val_check_interval: 0.5 +#val_check_interval: 0.5 # enable it with 'power' or 'binsearch' auto_scale_batch_size: #precision: 16 diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index 6aacde87..8d0e2cc3 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -130,7 +130,11 @@ def setup_and_run_tcn_lightning(args: Namespace, study: bool = False): args, logger=tcn_loggers, checkpoint_callback=checkpoint_callback, - callbacks=[early_stop_callback, metrics_callback, LearningRateMonitor(),], + callbacks=[ + early_stop_callback, + metrics_callback, + LearningRateMonitor(), + ], ) model = TCNLightning(args) @@ -208,7 +212,8 @@ def __init__(self, hparams: Namespace): if self.hparams.mix_density_net: self.linear_mu = nn.Linear(self.hparams.num_channels[-1], self.density_num) self.linear_sigmasq = nn.Linear( - self.hparams.num_channels[-1], self.density_num, + self.hparams.num_channels[-1], + self.density_num, ) self.linear_mix = nn.Linear(self.hparams.num_channels[-1], self.density_num) self = self.float() @@ -661,14 +666,27 @@ def _pre_sanity_check(hparams: Namespace) -> 
Namespace: @staticmethod def _pre_sanity_check_mini_series_lookback(hparams: Namespace) -> Namespace: - """Calculate lookback and mini_series_length if necessary.""" - net_depth = len(hparams.num_channels) - k_size = hparams.kernel_size - max_seq_len = TemporalConvNet.get_tcn_receptive_field_size(k_size, net_depth) - logger.info( - f"A TCN with kernel size: {k_size} and depth: {net_depth} has a receptive " - f"field (can read a maximum sequence length) of {max_seq_len}." - ) + """ + Calculate lookback and mini_series_length if necessary. + + Note, this works for both DL and RL (with two simultaneously trained nets). + However, for RL, we use the actor network's params to set the mini_series_len. + """ + + cases = ["", "critic_", "actor_"] + for case in cases: + num_channels = f"{case}num_channels" + k_size = f"{case}kernel_size" + if num_channels in hparams: + net_depth = len(hparams.__getattribute__(num_channels)) + k_size = hparams.__getattribute__(k_size) + max_seq_len = TemporalConvNet.get_tcn_receptive_field_size( + k_size, net_depth + ) + logger.info( + f"A {case}TCN with kernel size: {k_size} and depth: {net_depth} " + f"can read a maximum sequence length of {max_seq_len}." + ) if hparams.mini_series_length == "auto": logger.info(f"We set mini_series_length from 'auto' to {max_seq_len}.") hparams.mini_series_length = max_seq_len @@ -687,6 +705,7 @@ def _pre_sanity_check_mini_series_lookback(hparams: Namespace) -> Namespace: f"The current mini_series_legnth {hparams.mini_series_length}, " f"corresponds to an estimated lookback of {hparams.lookback} hours." 
) + return hparams def _sanity_check(self): """ diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py index 4df05a47..d4900664 100644 --- a/src/dagobert/modelling/rl/__init__.py +++ b/src/dagobert/modelling/rl/__init__.py @@ -1,3 +1,3 @@ from .environment import RLData, RLPortfolio, RLEnv -from .networks import build_tcn, ActorCriticAgent, ActorContinous -from .ppo import PPO, run_rl +from .networks import ActorCriticTCN, ActorCriticAgent, ActorContinous +from .ppo import PPO diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 0405a6c7..a969b2ae 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -6,9 +6,8 @@ from argparse import Namespace import gym +import torch import numpy as np -import pandas as pd -import matplotlib.pyplot as plt from torch.utils.data import Dataset, DataLoader from dagobert.naming import NPreprocessingArgs as npa @@ -65,6 +64,7 @@ def __init__( def step(self): Xs, ys = self.dataset[self.idx] y1 = np.concatenate([[1.0], ys]) + Xs = [torch.Tensor(x).unsqueeze(0) for x in Xs] episode_full = self.idx == self.hparams.max_episode_length - 1 done = True if episode_full else False self.idx += 1 @@ -209,7 +209,7 @@ def __init__(self, hparams: Namespace): # setup openai gym env - include cash in the portfolio action space self.action_space = gym.spaces.Box( - 0.0, 1.0, shape=(self.asset_n + 1), dtype=np.float32 + 0.0, 1.0, shape=(self.asset_n + 1,), dtype=np.float32 ) # get the observation space from the data min and max diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index c45824cb..16c18d3b 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -30,6 +30,7 @@ def __init__( Returns: Initiated TCN with the appropriate size for actor or critic. 
""" + super().__init__() self.hparams = hparams num_inputs = [len(cols) for dataset, cols in hparams.cols_to_model.items()] num_channels = ( @@ -46,9 +47,9 @@ def __init__( time_embed_dim=hparams.time_embed_dim, ) self.linear1 = nn.Linear(hparams.mini_series_length, 1) - self.linear2 = nn.Linear(num_channels[-1], hparams.n_actions) + self.linear2 = nn.Linear(num_channels[-1], n_actions) - def forward(self, *x): + def forward(self, x): y1 = self.tcn(*x) if self.hparams.use_last_timepoint: return self.linear2(y1[:, :, -1]) @@ -119,7 +120,7 @@ def __call__(self, state: torch.Tensor, device: str) -> Tuple: torch dsitribution and randomly sampled action """ - state = state.to(device=device) + state = [s.to(device=device) for s in state] pi, actions = self.actor_net(state) log_p = self.get_log_prob(pi, actions) diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index dc51ccd9..d0fa0c90 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -4,6 +4,7 @@ """ # pylint: disable=no-member import logging +from copy import deepcopy from pathlib import Path from typing import List, Tuple from argparse import Namespace @@ -23,7 +24,7 @@ from dagobert.naming import NRL, NStudy, NPreprocessingArgs as npa from dagobert.modelling.rl import ( RLEnv, - build_tcn, + ActorCriticTCN, ActorContinous, ActorCriticAgent, ) @@ -105,14 +106,17 @@ def __init__(self, hparams: Namespace): hparams = Preprocessing().preprocess_augment_dfs(hparams) self.hparams = Preprocessing().preprocess_train_dfs(hparams) - # create env and policy/value networks + # create env, init starting state and policy/value networks self.env = RLEnv(hparams) - self.critic = build_tcn(hparams, 1, actor=False) + first_Xs = self.env.reset() + self.state = [torch.FloatTensor(x) for x in first_Xs] + + self.critic = ActorCriticTCN(hparams, 1, actor=False) act_dim = self.env.action_space.shape[0] - self.actor = ActorContinous(build_tcn(hparams, act_dim), act_dim) + 
self.actor = ActorContinous(ActorCriticTCN(hparams, act_dim), act_dim) self.agent = ActorCriticAgent(self.actor, self.critic) - # init + # init batching and progress tracking vars self.batch_states = [] self.batch_actions = [] self.batch_adv = [] @@ -128,24 +132,22 @@ def __init__(self, hparams: Namespace): self.avg_ep_len = 0 self.avg_reward = 0 - self.state = torch.FloatTensor(self.env.reset()) - - def forward( - self, x: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Passes state x through the network and returns the policy and a sampled action. + # def forward( + # self, x: torch.Tensor + # ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # """ + # Passes state x through the network and returns the policy and a sampled action. - Args: - x: environment state + # Args: + # x: environment state - Returns: - Tuple of policy and action - """ - pi, action = self.actor(x) - value = self.critic(x) + # Returns: + # Tuple of policy and action + # """ + # pi, action = self.actor(*x) + # value = self.critic(*x) - return pi, action, value + # return pi, action, value def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: """ @@ -374,8 +376,9 @@ def _pre_sanity_check(hparams: Namespace): # make sure we have the same cols for each instrument # this helps to have an environment with a single tensor as state if len(hparams.cols_to_model) > 1: + anchor_cols = deepcopy(hparams.cols_to_model[npa.anchor]) for df_name, cols in hparams.cols_to_model.items(): - hparams.cols_to_model[df_name] = hparams.cols_to_model[npa.anchor] + hparams.cols_to_model[df_name] = anchor_cols # MINI SERIES / LOOKBACK hparams = TCNLightning._pre_sanity_check_mini_series_lookback(hparams) diff --git a/src/dagobert/modelling/rl/rl_runner.py b/src/dagobert/modelling/rl/rl_runner.py index d7d23845..5b3b9d1c 100644 --- a/src/dagobert/modelling/rl/rl_runner.py +++ b/src/dagobert/modelling/rl/rl_runner.py @@ -11,7 +11,7 @@ from dagobert.utils import 
setup_logging from dagobert.runner_utils import load_config, update_args from dagobert.modelling.rl.rl_args import get_all_args -from dagobert.modelling.rl import run_rl +from dagobert.modelling.rl.ppo import run_rl logger = logging.getLogger(__name__) From 85c8c9f8733743cb5a5729a1c3fb3f4e868bbcb6 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 2 Jan 2021 12:58:30 +0000 Subject: [PATCH 11/62] solved the data feeding problem, now onto making the env work --- config/rl_config.yaml | 7 +- src/dagobert/modelling/dl/data.py | 10 +-- src/dagobert/modelling/dl/tcn.py | 101 +++++++++++------------ src/dagobert/modelling/dl/tcn_net.py | 9 +- src/dagobert/modelling/rl/environment.py | 9 +- src/dagobert/modelling/rl/ppo.py | 30 +++---- 6 files changed, 74 insertions(+), 92 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index a651a58a..b56cb761 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -4,7 +4,7 @@ # LIGHTNING # -------------------------------------------------------------------------------------- -gpus: 1 +gpus: 0 pin_memory: True profiler: True #val_check_interval: 0.5 @@ -66,7 +66,8 @@ use_last_timepoint: True # DATA # -------------------------------------------------------------------------------------- -data_dir: "C:/Work/dagobert/data/modelling" +#data_dir: "C:/Work/dagobert/data/modelling" +data_dir: "/home/daniel/dagobert_data/modelling" lookback: auto mini_series_length: auto @@ -121,7 +122,7 @@ cols_to_model: - mdi_60 - vr_60 df2: - # the cols of the secondary DFs will automatically be set to anchor's + # the cols of the secondary DFs will automatically be set to anchor's if not defined time_feat_n: 1 time_embed_dim: 12 diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 1db71a2b..46748dd3 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -198,9 +198,7 @@ def __getitem__(self, idx): # FUNCTIONS FOR SETUP # 
---------------------------------------------------------------------------------- - def _load_df_anchor( - self, - ) -> pd.DatetimeIndex: + def _load_df_anchor(self,) -> pd.DatetimeIndex: """ Loads the anchor DF, and returns it. We use the anchor df for plotting and to extract the master index which we measure everything else against in batching. @@ -576,10 +574,10 @@ class PortfolioCryptoDataset(CryptoDataset): """ def __init__(self, *args, **kw): - super().__init__(*args, **kw) # for each instrument, we add the rl_return target col to their cols_to_model - for df_name, cols in self.cols_to_model.items(): - self.cols_to_model[df_name].append(NRL.rl_return) + for df_name, _ in kw[npa.cols_to_model].items(): + kw[npa.cols_to_model][df_name].append(NRL.rl_return) + super().__init__(*args, **kw) def __getitem__(self, idx): """ diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index 8d0e2cc3..35e858ad 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -130,11 +130,7 @@ def setup_and_run_tcn_lightning(args: Namespace, study: bool = False): args, logger=tcn_loggers, checkpoint_callback=checkpoint_callback, - callbacks=[ - early_stop_callback, - metrics_callback, - LearningRateMonitor(), - ], + callbacks=[early_stop_callback, metrics_callback, LearningRateMonitor(),], ) model = TCNLightning(args) @@ -212,8 +208,7 @@ def __init__(self, hparams: Namespace): if self.hparams.mix_density_net: self.linear_mu = nn.Linear(self.hparams.num_channels[-1], self.density_num) self.linear_sigmasq = nn.Linear( - self.hparams.num_channels[-1], - self.density_num, + self.hparams.num_channels[-1], self.density_num, ) self.linear_mix = nn.Linear(self.hparams.num_channels[-1], self.density_num) self = self.float() @@ -654,9 +649,6 @@ def _pre_sanity_check(hparams: Namespace) -> Namespace: "Classification is not applicable with mixed density nets" ) - # MINI SERIES / LOOKBACK - hparams = 
TCNLightning._pre_sanity_check_mini_series_lookback(hparams) - # ETC if hparams.augment_dfs and npa.anchor not in hparams.augment_dfs.keys(): raise ValueError( @@ -664,49 +656,6 @@ def _pre_sanity_check(hparams: Namespace) -> Namespace: ) return hparams - @staticmethod - def _pre_sanity_check_mini_series_lookback(hparams: Namespace) -> Namespace: - """ - Calculate lookback and mini_series_length if necessary. - - Note, this works for both DL and RL (with two simultaneously trained nets). - However, for RL, we use the actor network's params to set the mini_series_len. - """ - - cases = ["", "critic_", "actor_"] - for case in cases: - num_channels = f"{case}num_channels" - k_size = f"{case}kernel_size" - if num_channels in hparams: - net_depth = len(hparams.__getattribute__(num_channels)) - k_size = hparams.__getattribute__(k_size) - max_seq_len = TemporalConvNet.get_tcn_receptive_field_size( - k_size, net_depth - ) - logger.info( - f"A {case}TCN with kernel size: {k_size} and depth: {net_depth} " - f"can read a maximum sequence length of {max_seq_len}." - ) - if hparams.mini_series_length == "auto": - logger.info(f"We set mini_series_length from 'auto' to {max_seq_len}.") - hparams.mini_series_length = max_seq_len - if ( - hparams.mini_series_length != "auto" - and hparams.mini_series_length > max_seq_len - ): - logger.warning( - f"Provided mini-series length: {hparams.mini_series_length} is " - f"larger than the networks receptive field size: {max_seq_len}." - ) - # calcualte what the current TCN setup corresponds to in hourly lookback - df_anchor = TCNLightning._load_anchor(hparams) - hparams.lookback = update_lookback(df_anchor, hparams.mini_series_length) - logger.info( - f"The current mini_series_legnth {hparams.mini_series_length}, " - f"corresponds to an estimated lookback of {hparams.lookback} hours." - ) - return hparams - def _sanity_check(self): """ Make sure the options defined in hparams don't contradict each other. 
@@ -724,6 +673,9 @@ def _sanity_check(self): ): raise ValueError("You can either provide both df_val/df_test or neither!") + # MINI SERIES / LOOKBACK + self.hparams = TCNLightning._check_mini_series_lookback(self.hparams) + # TARGET VARIABLE if not self.hparams.regression: if self.hparams.simple_lookahead_reg: @@ -765,3 +717,46 @@ def _sanity_check(self): raise ValueError( "non_last_y_frac has to be between 0 and 1 when using lasy_y=False." ) + + @staticmethod + def _check_mini_series_lookback(hparams: Namespace) -> Namespace: + """ + Calculate lookback and mini_series_length if necessary. + + Note, this works for both DL and RL (with two simultaneously trained nets). + However, for RL, we use the actor network's params to set the mini_series_len. + """ + + cases = ["", "critic_", "actor_"] + for case in cases: + num_channels = f"{case}num_channels" + k_size = f"{case}kernel_size" + if num_channels in hparams: + net_depth = len(hparams.__getattribute__(num_channels)) + k_size = hparams.__getattribute__(k_size) + max_seq_len = TemporalConvNet.get_tcn_receptive_field_size( + k_size, net_depth + ) + logger.info( + f"A {case}TCN with kernel size: {k_size} and depth: {net_depth} " + f"can read a maximum sequence length of {max_seq_len}." + ) + if hparams.mini_series_length == "auto": + logger.info(f"We set mini_series_length from 'auto' to {max_seq_len}.") + hparams.mini_series_length = max_seq_len + if ( + hparams.mini_series_length != "auto" + and hparams.mini_series_length > max_seq_len + ): + logger.warning( + f"Provided mini-series length: {hparams.mini_series_length} is " + f"larger than the networks receptive field size: {max_seq_len}." 
+ ) + # calcualte what the current TCN setup corresponds to in hourly lookback + df_anchor = TCNLightning._load_anchor(hparams) + hparams.lookback = update_lookback(df_anchor, hparams.mini_series_length) + logger.info( + f"The current mini_series_legnth {hparams.mini_series_length}, " + f"corresponds to an estimated lookback of {hparams.lookback} hours." + ) + return hparams diff --git a/src/dagobert/modelling/dl/tcn_net.py b/src/dagobert/modelling/dl/tcn_net.py index b49b139a..0bf2bba7 100644 --- a/src/dagobert/modelling/dl/tcn_net.py +++ b/src/dagobert/modelling/dl/tcn_net.py @@ -40,14 +40,7 @@ class TemporalBlock(nn.Module): """ def __init__( - self, - n_inputs, - n_outputs, - kernel_size, - stride, - dilation, - padding, - dropout=0.2, + self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2, ): super(TemporalBlock, self).__init__() self.conv1 = weight_norm( diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index a969b2ae..89aad13c 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -2,6 +2,7 @@ Class defining PyTorch datasets for supervised modelling of a single instrument. """ import logging +from copy import deepcopy from typing import List, Tuple from argparse import Namespace @@ -26,9 +27,7 @@ class RLData(object): """ def __init__( - self, - hparams: Namespace, - train_val_test: str = "train", + self, hparams: Namespace, train_val_test: str = "train", ): """ Class constructor. 
@@ -49,7 +48,7 @@ def __init__( augment_method = None self.dataset = PortfolioCryptoDataset( df_to_load=getattr(self.hparams, f"df_{train_val_test}"), - cols_to_model=self.hparams.cols_to_model, + cols_to_model=deepcopy(self.hparams.cols_to_model), target_col=self.hparams.target_col, mini_series_length=self.hparams.mini_series_length, data_dir=self.hparams.data_dir, @@ -63,7 +62,9 @@ def __init__( def step(self): Xs, ys = self.dataset[self.idx] + # add cash price (always 1) to the new price vector y1 = np.concatenate([[1.0], ys]) + # turn Xs into a batch of 1 Xs = [torch.Tensor(x).unsqueeze(0) for x in Xs] episode_full = self.idx == self.hparams.max_episode_length - 1 done = True if episode_full else False diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index d0fa0c90..eeba2652 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -71,9 +71,7 @@ def run_rl(args): # define trainer and and lightning module args.multiprocessing = True if args.gpus != 1 else False trainer = Trainer.from_argparse_args( - args, - logger=tcn_loggers, - checkpoint_callback=checkpoint_callback, + args, logger=tcn_loggers, checkpoint_callback=checkpoint_callback, ) model = PPO(args) trainer.fit(model) @@ -97,23 +95,21 @@ def __init__(self, hparams: Namespace): and dagobert.modelling.rl.rl_args for more information on the params. 
""" super().__init__() - # sanity check and setup device hparams = PPO._pre_sanity_check(hparams) self.tcn_device = "cuda" if hparams.gpus > 0 else "cpu" # prepare datafiles if necessary hparams = Preprocessing().preprocess_augment_dfs(hparams) - self.hparams = Preprocessing().preprocess_train_dfs(hparams) + hparams = Preprocessing().preprocess_train_dfs(hparams) + self.hparams = TCNLightning._check_mini_series_lookback(hparams) # create env, init starting state and policy/value networks - self.env = RLEnv(hparams) - first_Xs = self.env.reset() - self.state = [torch.FloatTensor(x) for x in first_Xs] - - self.critic = ActorCriticTCN(hparams, 1, actor=False) + self.env = RLEnv(self.hparams) + self.state = self.env.reset() + self.critic = ActorCriticTCN(self.hparams, 1, actor=False) act_dim = self.env.action_space.shape[0] - self.actor = ActorContinous(ActorCriticTCN(hparams, act_dim), act_dim) + self.actor = ActorContinous(ActorCriticTCN(self.hparams, act_dim), act_dim) self.agent = ActorCriticAgent(self.actor, self.critic) # init batching and progress tracking vars @@ -373,13 +369,11 @@ def _pre_sanity_check(hparams: Namespace): if hparams.target_col != NRL.rl_return: raise ValueError("target_col has to be rl_return for RL tasks.") - # make sure we have the same cols for each instrument - # this helps to have an environment with a single tensor as state + # fill in the same cols for any df that doesn't have the cols_to_model defined if len(hparams.cols_to_model) > 1: - anchor_cols = deepcopy(hparams.cols_to_model[npa.anchor]) for df_name, cols in hparams.cols_to_model.items(): - hparams.cols_to_model[df_name] = anchor_cols - - # MINI SERIES / LOOKBACK - hparams = TCNLightning._pre_sanity_check_mini_series_lookback(hparams) + if df_name != npa.anchor and (cols is None or len(cols) == 0): + hparams.cols_to_model[df_name] = deepcopy( + hparams.cols_to_model[npa.anchor] + ) return hparams From ed384b920c226e0a6c4fac69d48d9e3e65ff0724 Mon Sep 17 00:00:00 2001 From: 
Daniel Homola Date: Sat, 2 Jan 2021 12:58:44 +0000 Subject: [PATCH 12/62] solved the data feeding problem, now onto making the env work --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25d9788a..1f2b0af4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3.7 \ No newline at end of file + \ No newline at end of file From a596113c26712267e10f80883902f4f4d928c4f8 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 2 Jan 2021 14:17:06 +0000 Subject: [PATCH 13/62] nearly training the network.. added dirichlet distribution instead of gaussian. still getting tensor shape mismatch errors.. --- .pre-commit-config.yaml | 2 +- config/rl_config.yaml | 6 ++-- src/dagobert/modelling/rl/environment.py | 8 +++-- src/dagobert/modelling/rl/networks.py | 45 +++++++++++++++--------- src/dagobert/modelling/rl/ppo.py | 40 ++++++++++----------- 5 files changed, 57 insertions(+), 44 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1f2b0af4..25d9788a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: 20.8b1 hooks: - id: black - \ No newline at end of file + language_version: python3.7 \ No newline at end of file diff --git a/config/rl_config.yaml b/config/rl_config.yaml index b56cb761..3c0e573d 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -4,7 +4,7 @@ # LIGHTNING # -------------------------------------------------------------------------------------- -gpus: 0 +gpus: 1 pin_memory: True profiler: True #val_check_interval: 0.5 @@ -66,8 +66,8 @@ use_last_timepoint: True # DATA # -------------------------------------------------------------------------------------- -#data_dir: "C:/Work/dagobert/data/modelling" -data_dir: "/home/daniel/dagobert_data/modelling" +data_dir: 
"C:/Work/dagobert/data/modelling" +#data_dir: "/home/daniel/dagobert_data/modelling" lookback: auto mini_series_length: auto diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 89aad13c..33818192 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -27,7 +27,9 @@ class RLData(object): """ def __init__( - self, hparams: Namespace, train_val_test: str = "train", + self, + hparams: Namespace, + train_val_test: str = "train", ): """ Class constructor. @@ -108,7 +110,7 @@ def __init__( which was found to be more stable. """ self.asset_names = asset_names - self.asset_num = len(asset_names) + self.asset_n = len(asset_names) self.trading_cost = trading_cost self.reward_type = reward_type self.reset() @@ -179,7 +181,7 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: def reset(self): self.infos = [] - self.w0 = np.zeros(self.asset_num) + self.w0 = np.zeros(self.asset_n + 1) self.w0[0] = 1 self.p0 = 1.0 diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 16c18d3b..06bfb89c 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -4,7 +4,7 @@ import torch from torch import nn -from torch.distributions import Categorical, Normal +from torch.distributions import Dirichlet from dagobert.modelling.dl import TemporalConvNet @@ -64,29 +64,45 @@ class ActorContinous(nn.Module): and an action given an observation """ - def __init__(self, actor_net, act_dim): + def __init__(self, actor_net): """ + The original PPO can be used for discrete action spaces with a Categorical + distribution or for a continuous actions space with a multivariate Gaussian, + where the network's outputs as raw logits go into it as the vector of mu and + the std is a separate learned parameter (same for all components). 
+ + This is fine if we want to sample unbounded continuos actions between -inf/inf, + but we here we need a mixture of weights for our portfolio that sums up to one. + + The recommended thing to do in RL circles in this setup is to use DDPG, which + is a completely different and deterministic policy gradient algo. Instead of + that, here we implement an idea that I found here on this reddit discussion + https://www.reddit.com/r/reinforcementlearning/comments/cl2kqn/special_case_of_continuous_action_space_rl/ + where they recommend swapping the Gaussian distribution for a Dirichlet one + and sampling our actions from that. This by design returns a probability + summing to one and there's no need to learn a separate std param. + + NOTE! I'm not sure how well this works or how legit it is, as I haven't found + any papers or implementations actually doing this. + Args: input_shape: observation shape of the environment n_actions: number of discrete actions available in the environment """ super().__init__() self.actor_net = actor_net - log_std = -0.5 * torch.ones(act_dim, dtype=torch.float) - self.log_std = torch.nn.Parameter(log_std) def forward(self, states): - mu = self.actor_net(states) - std = torch.exp(self.log_std) - pi = Normal(loc=mu, scale=std) + concentrations = nn.functional.softmax(self.actor_net(states), dim=1).squeeze(0) + pi = Dirichlet(concentrations) actions = pi.sample() - return pi, actions - def get_log_prob(self, pi: Normal, actions: torch.Tensor): + def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): """ - Takes in a distribution and actions and returns log prob of actions - under the distribution + Takes in a distribution and actions and returns log prob of actions under + the distribution + Args: pi: torch distribution actions: actions taken by distribution @@ -121,17 +137,12 @@ def __call__(self, state: torch.Tensor, device: str) -> Tuple: """ state = [s.to(device=device) for s in state] - pi, actions = self.actor_net(state) log_p = 
self.get_log_prob(pi, actions) - value = self.critic_net(state) - return pi, actions, log_p, value - def get_log_prob( - self, pi: Union[Categorical, Normal], actions: torch.Tensor - ) -> torch.Tensor: + def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor) -> torch.Tensor: """ Takes in the current state and returns the agents policy, a sampled action, log probability of the action, and the value of the state diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index eeba2652..d92676a7 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -71,7 +71,9 @@ def run_rl(args): # define trainer and and lightning module args.multiprocessing = True if args.gpus != 1 else False trainer = Trainer.from_argparse_args( - args, logger=tcn_loggers, checkpoint_callback=checkpoint_callback, + args, + logger=tcn_loggers, + checkpoint_callback=checkpoint_callback, ) model = PPO(args) trainer.fit(model) @@ -108,8 +110,8 @@ def __init__(self, hparams: Namespace): self.env = RLEnv(self.hparams) self.state = self.env.reset() self.critic = ActorCriticTCN(self.hparams, 1, actor=False) - act_dim = self.env.action_space.shape[0] - self.actor = ActorContinous(ActorCriticTCN(self.hparams, act_dim), act_dim) + actor_tcn = ActorCriticTCN(self.hparams, self.env.action_space.shape[0]) + self.actor = ActorContinous(actor_tcn) self.agent = ActorCriticAgent(self.actor, self.critic) # init batching and progress tracking vars @@ -128,22 +130,21 @@ def __init__(self, hparams: Namespace): self.avg_ep_len = 0 self.avg_reward = 0 - # def forward( - # self, x: torch.Tensor - # ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # """ - # Passes state x through the network and returns the policy and a sampled action. 
- - # Args: - # x: environment state + def forward( + self, *x: List[torch.Tensor] + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Passes state x through the network and returns the policy and a sampled action. - # Returns: - # Tuple of policy and action - # """ - # pi, action = self.actor(*x) - # value = self.critic(*x) + Args: + x: environment state - # return pi, action, value + Returns: + Tuple of policy and action + """ + pi, action = self.actor(*x) + value = self.critic(*x) + return pi, action, value def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: """ @@ -163,7 +164,6 @@ def discount_rewards(self, rewards: List[float], discount: float) -> List[float] for r in reversed(rewards): sum_r = (sum_r * discount) + r cumul_reward.append(sum_r) - return list(reversed(cumul_reward)) def calc_advantage( @@ -218,7 +218,7 @@ def train_batch( self.ep_rewards.append(reward) self.ep_values.append(value.item()) - self.state = torch.FloatTensor(next_state) + self.state = next_state epoch_end = step == (self.hparams.steps_per_epoch - 1) terminal = len(self.ep_rewards) == self.hparams.max_episode_length @@ -246,7 +246,7 @@ def train_batch( # reset params self.ep_rewards = [] self.ep_values = [] - self.state = torch.FloatTensor(self.env.reset()) + self.state = self.env.reset() if epoch_end: train_data = zip( From c32ed32cdaf1dbaeda03c133c9831b241f4e8001 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 2 Jan 2021 16:59:27 +0000 Subject: [PATCH 14/62] ppo is training (and producing nonsensical results) but it's TRAINING in lightninggit status! 
--- src/dagobert/modelling/rl/environment.py | 2 +- src/dagobert/modelling/rl/networks.py | 6 ++- src/dagobert/modelling/rl/ppo.py | 56 +++++++++++------------- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 33818192..ded6c16c 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -66,7 +66,7 @@ def step(self): Xs, ys = self.dataset[self.idx] # add cash price (always 1) to the new price vector y1 = np.concatenate([[1.0], ys]) - # turn Xs into a batch of 1 + # turn Xs into a batch of 1, ready to be fed into the actor/critic Xs = [torch.Tensor(x).unsqueeze(0) for x in Xs] episode_full = self.idx == self.hparams.max_episode_length - 1 done = True if episode_full else False diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 06bfb89c..664e82d0 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -77,12 +77,14 @@ def __init__(self, actor_net): The recommended thing to do in RL circles in this setup is to use DDPG, which is a completely different and deterministic policy gradient algo. Instead of that, here we implement an idea that I found here on this reddit discussion - https://www.reddit.com/r/reinforcementlearning/comments/cl2kqn/special_case_of_continuous_action_space_rl/ + https://www.reddit.com/r/reinforcementlearning/comments/cl2kqn/ + special_case_of_continuous_action_space_rl/ where they recommend swapping the Gaussian distribution for a Dirichlet one and sampling our actions from that. This by design returns a probability summing to one and there's no need to learn a separate std param. - NOTE! I'm not sure how well this works or how legit it is, as I haven't found + NOTE! + I'm not sure how well this works or how legit it is, as I haven't found any papers or implementations actually doing this. 
Args: diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index d92676a7..0c70dab8 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -61,8 +61,8 @@ def run_rl(args): # setup callbacks checkpoint_callback = ModelCheckpoint( - monitor="loss/val", - filename="_{epoch:02d}_{loss_val:.10f}", + monitor="avg_reward", + filename="_{epoch:02d}_{avg_reward:.10f}", dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", save_top_k=3, mode="max", @@ -130,21 +130,21 @@ def __init__(self, hparams: Namespace): self.avg_ep_len = 0 self.avg_reward = 0 - def forward( - self, *x: List[torch.Tensor] - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Passes state x through the network and returns the policy and a sampled action. + # def forward( + # self, states: List[torch.Tensor] + # ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # """ + # Passes state x through the network and returns the policy and a sampled action. 
- Args: - x: environment state + # Args: + # x: environment state - Returns: - Tuple of policy and action - """ - pi, action = self.actor(*x) - value = self.critic(*x) - return pi, action, value + # Returns: + # Tuple of policy and action + # """ + # pi, action = self.actor(states) + # value = self.critic(states) + # return pi, action, value def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: """ @@ -160,7 +160,6 @@ def discount_rewards(self, rewards: List[float], discount: float) -> List[float] cumul_reward = [] sum_r = 0.0 - for r in reversed(rewards): sum_r = (sum_r * discount) + r cumul_reward.append(sum_r) @@ -211,13 +210,12 @@ def train_batch( pi, action, log_prob, value = self.agent(self.state, self.device) next_state, reward, done, info = self.env.step(action.cpu().numpy()) - self.batch_states.append(self.state) + # drop first batch dim so dataloader later can resample them for backprop + self.batch_states.append([s.squeeze(0) for s in self.state]) self.batch_actions.append(action) self.batch_logp.append(log_prob) - self.ep_rewards.append(reward) self.ep_values.append(value.item()) - self.state = next_state epoch_end = step == (self.hparams.steps_per_epoch - 1) @@ -267,8 +265,8 @@ def train_batch( self.batch_qvals.clear() self.avg_ep_reward = self.epoch_rewards / self.done_episodes - self.avg_reward = self.epoch_rewards / self.steps_per_epoch - self.avg_ep_len = self.steps_per_epoch / self.done_episodes + self.avg_reward = self.epoch_rewards / self.hparams.steps_per_epoch + self.avg_ep_len = self.hparams.steps_per_epoch / self.done_episodes self.epoch_rewards = 0 self.done_episodes = 0 @@ -276,9 +274,11 @@ def train_batch( def actor_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: pi, _ = self.actor(state) logp = self.actor.get_log_prob(pi, action) - ratio = torch.exp(logp - logp_old) - clip_adv = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv - loss_actor = -(torch.min(ratio * adv, 
clip_adv)).mean() + ratio = torch.exp(logp - logp_old.sum(-1)) + clip_ratio = torch.clamp( + ratio, 1 - self.hparams.clip_ratio, 1 + self.hparams.clip_ratio + ) + loss_actor = -(torch.min(ratio * adv, clip_ratio * adv)).mean() return loss_actor def critic_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: @@ -334,7 +334,7 @@ def training_step( loss_critic, on_step=False, on_epoch=True, - prog_bar=False, + prog_bar=True, logger=True, ) return loss_critic @@ -353,16 +353,12 @@ def configure_optimizers(self) -> List[optim.Optimizer]: optimizers.append(optimizer_critic) return optimizers - def _dataloader(self) -> DataLoader: + def train_dataloader(self) -> DataLoader: """Initialize the Replay Buffer dataset used for retrieving experiences""" dataset = ExperienceSourceDataset(self.train_batch) dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size) return dataloader - def train_dataloader(self) -> DataLoader: - """Get train loader""" - return self._dataloader() - @staticmethod def _pre_sanity_check(hparams: Namespace): # ensure we have the rl specific target column in the config From 466d7b2c7ae0aa732ff1ee900860ed9d536dc344 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 2 Jan 2021 17:00:27 +0000 Subject: [PATCH 15/62] deleted unnecessary files --- src/dagobert/modelling/rl/env01.py | 442 --------------------------- src/dagobert/modelling/rl/env02.py | 473 ----------------------------- 2 files changed, 915 deletions(-) delete mode 100644 src/dagobert/modelling/rl/env01.py delete mode 100644 src/dagobert/modelling/rl/env02.py diff --git a/src/dagobert/modelling/rl/env01.py b/src/dagobert/modelling/rl/env01.py deleted file mode 100644 index 4015cf7c..00000000 --- a/src/dagobert/modelling/rl/env01.py +++ /dev/null @@ -1,442 +0,0 @@ -""" -https://github.com/wassname/rl-portfolio-management/blob/master/rl_portfolio_management/environments/portfolio.py -""" -import numpy as np -import pandas as pd -from matplotlib import pyplot as 
plt -from pprint import pprint -import logging -import os -import tempfile -import time -import gym -import gym.spaces - -from ..config import eps -from ..data.utils import normalize, random_shift, scale_to_start -from ..util import MDD as max_drawdown, sharpe, softmax -from ..callbacks.notebook_plot import LivePlotNotebook - -logger = logging.getLogger(__name__) - - -class DataSrc(object): - """Acts as data provider for each new episode.""" - - def __init__( - self, - df, - steps=252, - scale=True, - scale_extra_cols=True, - augment=0.00, - window_length=50, - random_reset=True, - ): - """ - DataSrc. - - df - csv for data frame index of timestamps - and multi-index columns levels=[['LTCBTC'],...],['open','low','high','close',...]] - an example is included as an hdf file in this repository - steps - total steps in episode - scale - scale the data for each episode - scale_extra_cols - scale extra columns by global mean and std - augment - fraction to augment the data by - random_reset - reset to a random time (otherwise continue through time) - """ - self.steps = steps + 1 - self.augment = augment - self.random_reset = random_reset - self.scale = scale - self.scale_extra_cols = scale_extra_cols - self.window_length = window_length - self.idx = self.window_length - - # get rid of NaN's - df = df.copy() - df.replace(np.nan, 0, inplace=True) - df = df.fillna(method="pad") - - # dataframe to matrix - self.asset_names = df.columns.levels[0].tolist() - self.features = df.columns.levels[1].tolist() - data = df.as_matrix().reshape( - (len(df), len(self.asset_names), len(self.features)) - ) - self._data = np.transpose(data, (1, 0, 2)) - self._times = df.index - - self.price_columns = ["close", "high", "low", "open"] - self.non_price_columns = set(df.columns.levels[1]) - set(self.price_columns) - - # Stats to let us normalize non price columns - if scale_extra_cols: - x = self._data.reshape((-1, len(self.features))) - self.stats = dict(mean=x.mean(0), std=x.std(0)) - # for 
column in self._data.columns.levels[1].tolist(): - # x = df.xs(key=column, axis=1, level='Price').as_matrix()[:, :] - # self.stats["mean"].append(x.mean()) - # = dict(mean=x.mean(), std=x.std()) - - self.reset() - - def _step(self): - # get history matrix from dataframe - data_window = self.data[:, self.step : self.step + self.window_length].copy() - - # (eq.1) prices - y1 = data_window[:, -1, 0] / data_window[:, -2, 0] - y1 = np.concatenate([[1.0], y1]) # add cash price - - # (eq 18) X: prices are divided by close price - nb_pc = len(self.price_columns) - if self.scale: - last_close_price = data_window[:, -1, 0] - data_window[:, :, :nb_pc] /= last_close_price[:, np.newaxis, np.newaxis] - - if self.scale_extra_cols: - # normalize non price columns - data_window[:, :, nb_pc:] -= self.stats["mean"][None, None, nb_pc:] - data_window[:, :, nb_pc:] /= self.stats["std"][None, None, nb_pc:] - data_window[:, :, nb_pc:] = np.clip( - data_window[:, :, nb_pc:], - self.stats["mean"][nb_pc:] - self.stats["std"][nb_pc:] * 10, - self.stats["mean"][nb_pc:] + self.stats["std"][nb_pc:] * 10, - ) - - self.step += 1 - history = data_window - done = bool(self.step >= self.steps) - - return history, y1, done - - def reset(self): - self.step = 0 - - # get data for this episode - if self.random_reset: - self.idx = np.random.randint( - low=self.window_length + 1, high=self._data.shape[1] - self.steps - 2 - ) - else: - # continue sequentially, before reseting to start - if self.idx > (self._data.shape[1] - self.steps - self.window_length - 1): - self.idx = self.window_length + 1 - else: - self.idx += self.steps - data = self._data[ - :, self.idx - self.window_length : self.idx + self.steps + 1 - ].copy() - self.times = self._times[ - self.idx - self.window_length : self.idx + self.steps + 1 - ] - - # augment data to prevent overfitting - data += np.random.normal(loc=0, scale=self.augment, size=data.shape) - - self.data = data - - -class PortfolioSim(object): - """ - Portfolio management 
sim. - - Params: - - cost e.g. 0.0025 is max in Poliniex - - Based of [Jiang 2017](https://arxiv.org/abs/1706.10059) - """ - - def __init__(self, asset_names=[], steps=128, trading_cost=0.0025, time_cost=0.0): - self.cost = trading_cost - self.time_cost = time_cost - self.steps = steps - self.asset_names = asset_names - self.reset() - - def _step(self, w1, y1): - """ - Step. - - w1 - new action of portfolio weights - e.g. [0.1,0.9, 0.0] - y1 - price relative vector also called return - e.g. [1.0, 0.9, 1.1] - Numbered equations are from https://arxiv.org/abs/1706.10059 - """ - w0 = self.w0 - p0 = self.p0 - - # (eq7) since we last acted prices changed, so weights evolve into - dw1 = (y1 * w0) / (np.dot(y1, w0) + eps) - - # (eq16) cost to change portfolio: p' -> mu -> pt, see Figure 1 - # excluding change in cash to avoid double counting for transaction cost - mu = self.cost * (np.abs(dw1[1:] - w1[1:])).sum() - - # (eq11) final portfolio value: see section between (eq19-20) why this works - p1 = p0 * (1 - mu) * np.dot(y1, w0) - - # (eq9 & 10) rate of return log rate of return - rho1 = p1 / p0 - 1 # rate of returns - r1 = np.log(p1 + eps) - np.log(p0 + eps) - - # (eq22) immediate reward is log rate of return scaled by episode length - reward = r1 / self.steps - - # remember for next step - self.w0 = w1 - self.p0 = p1 - - # if we run out of money, we're done - done = bool(p1 == 0) - - # should only return single values, not list - info = { - "reward": reward, - "log_return": r1, - "portfolio_value": p1, - "market_return": y1.mean(), - "rate_of_return": rho1, - "weights_mean": w1.mean(), - "weights_std": w1.std(), - "cost": mu, - } - # record weights and prices - for i, name in enumerate(["BTCBTC"] + self.asset_names): - info["weight_" + name] = w1[i] - info["price_" + name] = y1[i] - - self.infos.append(info) - return reward, info, done - - def reset(self): - self.infos = [] - self.w0 = np.array([1.0] + [0.0] * len(self.asset_names)) - self.p0 = 1.0 - - -class 
PortfolioEnv(gym.Env): - """ - An environment for financial portfolio management. - - Financial portfolio management is the process of constant redistribution of a fund into different - financial products. - - Based on [Jiang 2017](https://arxiv.org/abs/1706.10059) - """ - - metadata = {"render.modes": ["notebook", "ansi"]} - - def __init__( - self, - df, - steps=256, - trading_cost=0.0025, - time_cost=0.00, - window_length=50, - augment=0.00, - output_mode="EIIE", - log_dir=None, - scale=True, - scale_extra_cols=True, - random_reset=True, - ): - """ - An environment for financial portfolio management. - - Params: - df - csv for data frame index of timestamps - and multi-index columns levels=[['LTCBTC'],...],['open','low','high','close']] - steps - steps in episode - window_length - how many past observations["history"] to return - trading_cost - cost of trade as a fraction, e.g. 0.0025 corresponding to max rate of 0.25% at Poloniex (2017) - time_cost - cost of holding as a fraction - augment - fraction to randomly shift data by - output_mode: decides observation["history"] shape - - 'EIIE' for (assets, window, 3) - - 'atari' for (window, window, 3) (assets is padded) - - 'mlp' for (assets*window*3) - log_dir: directory to save plots to - scale - scales price data by last opening price on each episode (except return) - scale_extra_cols - scales non price data using mean and std for whole dataset - """ - self.src = DataSrc( - df=df, - steps=steps, - scale=scale, - scale_extra_cols=scale_extra_cols, - augment=augment, - window_length=window_length, - random_reset=random_reset, - ) - self._plot = self._plot2 = self._plot3 = None - self.output_mode = output_mode - self.sim = PortfolioSim( - asset_names=self.src.asset_names, - trading_cost=trading_cost, - time_cost=time_cost, - steps=steps, - ) - self.log_dir = log_dir - - # openai gym attributes - # action will be the portfolio weights [cash_bias,w1,w2...] 
where wn are [0, 1] for each asset - nb_assets = len(self.src.asset_names) - self.action_space = gym.spaces.Box(0.0, 1.0, shape=nb_assets + 1) - - # get the history space from the data min and max - if output_mode == "EIIE": - obs_shape = (nb_assets, window_length, len(self.src.features)) - elif output_mode == "atari": - obs_shape = (window_length, window_length, len(self.src.features)) - elif output_mode == "mlp": - obs_shape = (nb_assets) * window_length * (len(self.src.features)) - else: - raise Exception("Invalid value for output_mode: %s" % self.output_mode) - - self.observation_space = gym.spaces.Dict( - { - "history": gym.spaces.Box( - -10, - 20 - if scale - else 1, # if scale=True observed price changes return could be large fractions - obs_shape, - ), - "weights": self.action_space, - } - ) - self._reset() - - def _step(self, action): - """ - Step the env. - - Actions should be portfolio [w0...] - - Where wn is a portfolio weight between 0 and 1. The first (w0) is cash_bias - - cn is the portfolio conversion weights see PortioSim._step for description - """ - logger.debug("action: %s", action) - - weights = np.clip(action, 0.0, 1.0) - weights /= weights.sum() + eps - - # Sanity checks - assert self.action_space.contains( - action - ), "action should be within %r but is %r" % (self.action_space, action) - np.testing.assert_almost_equal( - np.sum(weights), - 1.0, - 3, - err_msg='weights should sum to 1. 
action="%s"' % weights, - ) - - history, y1, done1 = self.src._step() - - reward, info, done2 = self.sim._step(weights, y1) - - # calculate return for buy and hold a bit of each asset - info["market_value"] = np.cumprod( - [inf["market_return"] for inf in self.infos + [info]] - )[-1] - # add dates - info["date"] = self.src.times[self.src.step].timestamp() - info["steps"] = self.src.step - - self.infos.append(info) - - # reshape history according to output mode - if self.output_mode == "EIIE": - pass - elif self.output_mode == "atari": - padding = history.shape[1] - history.shape[0] - history = np.pad(history, [[0, padding], [0, 0], [0, 0]], mode="constant") - elif self.output_mode == "mlp": - history = history.flatten() - - return {"history": history, "weights": weights}, reward, done1 or done2, info - - def _reset(self): - self.sim.reset() - self.src.reset() - self.infos = [] - action = self.sim.w0 - observation, reward, done, info = self.step(action) - return observation - - def _seed(self, seed): - np.random.seed(seed) - return [seed] - - def _render(self, mode="notebook", close=False): - # if close: - # return - if mode == "ansi": - pprint(self.infos[-1]) - elif mode == "notebook": - self.plot_notebook(close) - - def plot_notebook(self, close=False): - """Live plot using the jupyter notebook rendering of matplotlib.""" - - if close: - self._plot = self._plot2 = self._plot3 = None - return - - df_info = pd.DataFrame(self.infos) - df_info.index = pd.to_datetime(df_info["date"], unit="s") - - # plot prices and performance - all_assets = ["BTCBTC"] + self.sim.asset_names - if not self._plot: - colors = [None] * len(all_assets) + ["black"] - self._plot_dir = ( - os.path.join(self.log_dir, "notebook_plot_prices_" + str(time.time())) - if self.log_dir - else None - ) - self._plot = LivePlotNotebook( - log_dir=self._plot_dir, - title="prices & performance", - labels=all_assets + ["Portfolio"], - ylabel="value", - colors=colors, - ) - x = df_info.index - y_portfolio = 
df_info["portfolio_value"] - y_assets = [df_info["price_" + name].cumprod() for name in all_assets] - self._plot.update(x, y_assets + [y_portfolio]) - - # plot portfolio weights - if not self._plot2: - self._plot_dir2 = ( - os.path.join(self.log_dir, "notebook_plot_weights_" + str(time.time())) - if self.log_dir - else None - ) - self._plot2 = LivePlotNotebook( - log_dir=self._plot_dir2, - labels=all_assets, - title="weights", - ylabel="weight", - ) - ys = [df_info["weight_" + name] for name in all_assets] - self._plot2.update(x, ys) - - # plot portfolio costs - if not self._plot3: - self._plot_dir3 = ( - os.path.join(self.log_dir, "notebook_plot_cost_" + str(time.time())) - if self.log_dir - else None - ) - self._plot3 = LivePlotNotebook( - log_dir=self._plot_dir3, labels=["cost"], title="costs", ylabel="cost" - ) - ys = [df_info["cost"].cumsum()] - self._plot3.update(x, ys) - - if close: - self._plot = self._plot2 = self._plot3 = None diff --git a/src/dagobert/modelling/rl/env02.py b/src/dagobert/modelling/rl/env02.py deleted file mode 100644 index 7dfdbd55..00000000 --- a/src/dagobert/modelling/rl/env02.py +++ /dev/null @@ -1,473 +0,0 @@ -""" -Modified from https://github.com/vermouth1992/drl-portfolio-management -""" -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -import gym -import gym.spaces - -eps = np.finfo(float).eps - - -def random_shift(x, fraction): - """ Apply a random shift to a pandas series. """ - min_x, max_x = np.min(x), np.max(x) - m = np.random.uniform(-fraction, fraction, size=x.shape) + 1 - return np.clip(x * m, min_x, max_x) - - -def scale_to_start(x): - """ Scale pandas series so that it starts at one. """ - x = (x + eps) / (x[0] + eps) - return x - - -def sharpe(returns, freq=30, rfr=0): - """ Given a set of returns, calculates naive (rfr=0) sharpe (eq 28). """ - return (np.sqrt(freq) * np.mean(returns - rfr + eps)) / np.std(returns - rfr + eps) - - -def max_drawdown(returns): - """ Max drawdown. 
See https://www.investopedia.com/terms/m/maximum-drawdown-mdd.asp """ - peak = returns.max() - trough = returns[returns.argmax() :].min() - return (trough - peak) / (peak + eps) - - -class DataGenerator(object): - """Acts as data provider for each new episode.""" - - def __init__( - self, - history, - abbreviation, - steps=730, - window_length=50, - start_idx=0, - start_date=None, - ): - """ - - Args: - history: (num_stocks, timestamp, 5) open, high, low, close, volume - abbreviation: a list of length num_stocks with assets name - steps: the total number of steps to simulate, default is 2 years - window_length: observation window, must be less than 50 - start_date: the date to start. Default is None and random pick one. - It should be a string e.g. '2012-08-13' - """ - assert history.shape[0] == len( - abbreviation - ), "Number of stock is not consistent" - import copy - - self.steps = steps + 1 - self.window_length = window_length - self.step = start_idx - self.start_date = start_date - - # make immutable class - self._data = history.copy() # all data - self.asset_names = copy.copy(abbreviation) - - def _step(self): - # get observation matrix from history, exclude volume, maybe volume is useful as it - # indicates how market total investment changes. Normalize could be critical here - self.step += 1 - obs = self.data[:, self.step : self.step + self.window_length, :].copy() - # normalize obs with open price - - # used for compute optimal action and sanity check - ground_truth_obs = self.data[ - :, self.step + self.window_length : self.step + self.window_length + 1, : - ].copy() - - done = self.step >= self.steps - return obs, done, ground_truth_obs - - def reset(self): - self.step = 0 - - # get data for this episode, each episode might be different. 
- if self.start_date is None: - self.idx = np.random.randint( - low=self.window_length, high=self._data.shape[1] - self.steps - ) - else: - raise ValueError("start_date is not yet supported / implemented") - # compute index corresponding to start_date for repeatable sequence - # self.idx = date_to_index(self.start_date) - self.start_idx - # assert ( - # self.idx >= self.window_length - # and self.idx <= self._data.shape[1] - self.steps - # ), "Invalid start date, must be window_length day after start date and simulation steps day before end date" - data = self._data[ - :, self.idx - self.window_length : self.idx + self.steps + 1, :4 - ] - # apply augmentation? - self.data = data - return ( - self.data[:, self.step : self.step + self.window_length, :].copy(), - self.data[ - :, - self.step + self.window_length : self.step + self.window_length + 1, - :, - ].copy(), - ) - - -class PortfolioSim(object): - """ - Portfolio management sim. - Params: - - cost e.g. 0.0025 is max in Poliniex - Based of [Jiang 2017](https://arxiv.org/abs/1706.10059) - """ - - def __init__( - self, asset_names=list(), steps=730, trading_cost=0.0025, time_cost=0.0 - ): - self.asset_names = asset_names - self.cost = trading_cost - self.time_cost = time_cost - self.steps = steps - self.reset() - - def _step(self, w1, y1): - """ - Step. - w1 - new action of portfolio weights - e.g. [0.1,0.9,0.0] - y1 - price relative vector also called return - e.g. 
[1.0, 0.9, 1.1] - Numbered equations are from https://arxiv.org/abs/1706.10059 - """ - assert w1.shape == y1.shape, "w1 and y1 must have the same shape" - assert y1[0] == 1.0, "y1[0] must be 1" - - p0 = self.p0 - - dw1 = (y1 * w1) / (np.dot(y1, w1) + eps) # (eq7) weights evolve into - - mu1 = self.cost * (np.abs(dw1 - w1)).sum() # (eq16) cost to change portfolio - - assert mu1 < 1.0, "Cost is larger than current holding" - - p1 = p0 * (1 - mu1) * np.dot(y1, w1) # (eq11) final portfolio value - - p1 = p1 * (1 - self.time_cost) # we can add a cost to holding - - rho1 = p1 / p0 - 1 # rate of returns - r1 = np.log((p1 + eps) / (p0 + eps)) # log rate of return - reward = r1 / self.steps * 1000.0 # (22) average logarithmic accumulated return - # remember for next step - self.p0 = p1 - - # if we run out of money, we're done (losing all the money) - done = p1 == 0 - - info = { - "reward": reward, - "log_return": r1, - "portfolio_value": p1, - "return": y1.mean(), - "rate_of_return": rho1, - "weights_mean": w1.mean(), - "weights_std": w1.std(), - "cost": mu1, - } - self.infos.append(info) - return reward, info, done - - def reset(self): - self.infos = [] - self.p0 = 1.0 - - -class PortfolioEnv(gym.Env): - """ - An environment for financial portfolio management. - Financial portfolio management is the process of constant redistribution of a fund into different - financial products. - Based on [Jiang 2017](https://arxiv.org/abs/1706.10059) - """ - - metadata = {"render.modes": ["human", "ansi"]} - - def __init__( - self, - history, - abbreviation, - steps=730, # 2 years - trading_cost=0.0025, - time_cost=0.00, - window_length=50, - start_idx=0, - sample_start_date=None, - ): - """ - An environment for financial portfolio management. 
- Params: - steps - steps in episode - scale - scale data and each episode (except return) - augment - fraction to randomly shift data by - trading_cost - cost of trade as a fraction - time_cost - cost of holding as a fraction - window_length - how many past observations to return - start_idx - The number of days from '2012-08-13' of the dataset - sample_start_date - The start date sampling from the history - """ - self.window_length = window_length - self.num_stocks = history.shape[0] - self.start_idx = start_idx - - self.src = DataGenerator( - history, - abbreviation, - steps=steps, - window_length=window_length, - start_idx=start_idx, - start_date=sample_start_date, - ) - - self.sim = PortfolioSim( - asset_names=abbreviation, - trading_cost=trading_cost, - time_cost=time_cost, - steps=steps, - ) - - # openai gym attributes - # action will be the portfolio weights from 0 to 1 for each asset - self.action_space = gym.spaces.Box( - 0, 1, shape=(len(self.src.asset_names) + 1,), dtype=np.float32 - ) # include cash - - # get the observation space from the data min and max - self.observation_space = gym.spaces.Box( - low=-np.inf, - high=np.inf, - shape=(len(abbreviation), window_length, history.shape[-1]), - dtype=np.float32, - ) - - def step(self, action): - return self._step(action) - - def _step(self, action): - """ - Step the env. - Actions should be portfolio [w0...] - - Where wn is a portfolio weight from 0 to 1. The first is cash_bias - - cn is the portfolio conversion weights see PortioSim._step for description - """ - np.testing.assert_almost_equal(action.shape, (len(self.sim.asset_names) + 1,)) - - # normalise just in case - action = np.clip(action, 0, 1) - - weights = action # np.array([cash_bias] + list(action)) # [w0, w1...] - weights /= weights.sum() + eps - weights[0] += np.clip( - 1 - weights.sum(), 0, 1 - ) # so if weights are all zeros we normalise to [1,0...] 
- - assert ((action >= 0) * (action <= 1)).all(), ( - "all action values should be between 0 and 1. Not %s" % action - ) - np.testing.assert_almost_equal( - np.sum(weights), - 1.0, - 3, - err_msg='weights should sum to 1. action="%s"' % weights, - ) - - observation, done1, ground_truth_obs = self.src._step() - - # concatenate observation with ones - cash_observation = np.ones((1, self.window_length, observation.shape[2])) - observation = np.concatenate((cash_observation, observation), axis=0) - - cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) - ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) - - # relative price vector of last observation day (close/open) - close_price_vector = observation[:, -1, 3] - open_price_vector = observation[:, -1, 0] - y1 = close_price_vector / open_price_vector - reward, info, done2 = self.sim._step(weights, y1) - - # calculate return for buy and hold a bit of each asset - info["market_value"] = np.cumprod( - [inf["return"] for inf in self.infos + [info]] - )[-1] - # add dates - info["date"] = self.start_idx + self.src.idx + self.src.step - info["steps"] = self.src.step - info["next_obs"] = ground_truth_obs - - self.infos.append(info) - - return observation, reward, done1 or done2, info - - def reset(self): - return self._reset() - - def _reset(self): - self.infos = [] - self.sim.reset() - observation, ground_truth_obs = self.src.reset() - cash_observation = np.ones((1, self.window_length, observation.shape[2])) - observation = np.concatenate((cash_observation, observation), axis=0) - cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) - ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) - info = {} - info["next_obs"] = ground_truth_obs - return observation, info - - def _render(self, mode="human", close=False): - if close: - return - if mode == "ansi": - print(self.infos[-1]) - elif mode == "human": - self.plot() - - def render(self, mode="human", 
close=False): - return self._render(mode="human", close=False) - - def plot(self): - # show a plot of portfolio vs mean market performance - df_info = pd.DataFrame(self.infos) - df_info["date"] = pd.to_datetime(df_info["date"], format="%Y-%m-%d") - df_info.set_index("date", inplace=True) - mdd = max_drawdown(df_info.rate_of_return + 1) - sharpe_ratio = sharpe(df_info.rate_of_return) - title = "max_drawdown={: 2.2%} sharpe_ratio={: 2.4f}".format(mdd, sharpe_ratio) - df_info[["portfolio_value", "market_value"]].plot( - title=title, fig=plt.gcf(), rot=30 - ) - - -class MultiActionPortfolioEnv(PortfolioEnv): - def __init__( - self, - history, - abbreviation, - model_names, - steps=730, # 2 years - trading_cost=0.0025, - time_cost=0.00, - window_length=50, - start_idx=0, - sample_start_date=None, - ): - super(MultiActionPortfolioEnv, self).__init__( - history, - abbreviation, - steps, - trading_cost, - time_cost, - window_length, - start_idx, - sample_start_date, - ) - self.model_names = model_names - # need to create each simulator for each model - self.sim = [ - PortfolioSim( - asset_names=abbreviation, - trading_cost=trading_cost, - time_cost=time_cost, - steps=steps, - ) - for _ in range(len(self.model_names)) - ] - - def _step(self, action): - """Step the environment by a vector of actions - - Args: - action: (num_models, num_stocks + 1) - - Returns: - - """ - assert ( - action.ndim == 2 - ), "Action must be a two dimensional array with shape (num_models, num_stocks + 1)" - assert action.shape[1] == len(self.sim[0].asset_names) + 1 - assert action.shape[0] == len(self.model_names) - # normalise just in case - action = np.clip(action, 0, 1) - weights = action # np.array([cash_bias] + list(action)) # [w0, w1...] - weights /= np.sum(weights, axis=1, keepdims=True) + eps - # so if weights are all zeros we normalise to [1,0...] 
- weights[:, 0] += np.clip(1 - np.sum(weights, axis=1), 0, 1) - assert ((action >= 0) * (action <= 1)).all(), ( - "all action values should be between 0 and 1. Not %s" % action - ) - np.testing.assert_almost_equal( - np.sum(weights, axis=1), - np.ones(shape=(weights.shape[0])), - 3, - err_msg='weights should sum to 1. action="%s"' % weights, - ) - observation, done1, ground_truth_obs = self.src._step() - - # concatenate observation with ones - cash_observation = np.ones((1, self.window_length, observation.shape[2])) - observation = np.concatenate((cash_observation, observation), axis=0) - - cash_ground_truth = np.ones((1, 1, ground_truth_obs.shape[2])) - ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) - - # relative price vector of last observation day (close/open) - close_price_vector = observation[:, -1, 3] - open_price_vector = observation[:, -1, 0] - y1 = close_price_vector / open_price_vector - - rewards = np.empty(shape=(weights.shape[0])) - info = {} - dones = np.empty(shape=(weights.shape[0]), dtype=bool) - for i in range(weights.shape[0]): - reward, current_info, done2 = self.sim[i]._step(weights[i], y1) - rewards[i] = reward - info[self.model_names[i]] = current_info["portfolio_value"] - info["return"] = current_info["return"] - dones[i] = done2 - - # calculate return for buy and hold a bit of each asset - info["market_value"] = np.cumprod( - [inf["return"] for inf in self.infos + [info]] - )[-1] - # add dates - info["date"] = self.start_idx + self.src.idx + self.src.step - info["steps"] = self.src.step - info["next_obs"] = ground_truth_obs - - self.infos.append(info) - - return observation, rewards, np.all(dones) or done1, info - - def _reset(self): - self.infos = [] - for sim in self.sim: - sim.reset() - observation, ground_truth_obs = self.src.reset() - cash_observation = np.ones((1, self.window_length, observation.shape[2])) - observation = np.concatenate((cash_observation, observation), axis=0) - cash_ground_truth = 
np.ones((1, 1, ground_truth_obs.shape[2])) - ground_truth_obs = np.concatenate((cash_ground_truth, ground_truth_obs), axis=0) - info = {} - info["next_obs"] = ground_truth_obs - return observation, info - - def plot(self): - df_info = pd.DataFrame(self.infos) - fig = plt.gcf() - title = "Trading Performance of Various Models" - df_info["date"] = pd.to_datetime(df_info["date"], format="%Y-%m-%d") - df_info.set_index("date", inplace=True) - df_info[self.model_names + ["market_value"]].plot(title=title, fig=fig, rot=30) From 15c2283f827032129a4cc4327aad5dd32ed094ff Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 2 Jan 2021 17:02:21 +0000 Subject: [PATCH 16/62] blackening somehow didn't run...Marci used to have this.. --- src/dagobert/modelling/dl/data.py | 4 +++- src/dagobert/modelling/dl/preprocessing.py | 10 ++++++++-- src/dagobert/modelling/dl/tcn.py | 9 +++++++-- src/dagobert/modelling/dl/tcn_args.py | 18 ++++++++++++++---- src/dagobert/modelling/dl/tcn_net.py | 9 ++++++++- 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 46748dd3..91e96391 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -198,7 +198,9 @@ def __getitem__(self, idx): # FUNCTIONS FOR SETUP # ---------------------------------------------------------------------------------- - def _load_df_anchor(self,) -> pd.DatetimeIndex: + def _load_df_anchor( + self, + ) -> pd.DatetimeIndex: """ Loads the anchor DF, and returns it. We use the anchor df for plotting and to extract the master index which we measure everything else against in batching. 
diff --git a/src/dagobert/modelling/dl/preprocessing.py b/src/dagobert/modelling/dl/preprocessing.py index ebce640c..ffa6916a 100644 --- a/src/dagobert/modelling/dl/preprocessing.py +++ b/src/dagobert/modelling/dl/preprocessing.py @@ -85,7 +85,10 @@ def preprocess_augment_dfs(hparams: Namespace) -> Namespace: @staticmethod def _preprocess_augment_dfs( - hparams: Namespace, df_name: str, df_path: str, df_path_prev: str, + hparams: Namespace, + df_name: str, + df_path: str, + df_path_prev: str, ) -> Optional: """ Helper function that performs the preprocessing of simple augment DFs. @@ -550,7 +553,10 @@ def _quantile_filter( @staticmethod def _binarise( - df: pd.DataFrame, method: str, threshold: float, df_name: str = "", + df: pd.DataFrame, + method: str, + threshold: float, + df_name: str = "", ) -> pd.DataFrame: """ Binarises a DF with the provided method and threshold. diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index 35e858ad..79dd6457 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -130,7 +130,11 @@ def setup_and_run_tcn_lightning(args: Namespace, study: bool = False): args, logger=tcn_loggers, checkpoint_callback=checkpoint_callback, - callbacks=[early_stop_callback, metrics_callback, LearningRateMonitor(),], + callbacks=[ + early_stop_callback, + metrics_callback, + LearningRateMonitor(), + ], ) model = TCNLightning(args) @@ -208,7 +212,8 @@ def __init__(self, hparams: Namespace): if self.hparams.mix_density_net: self.linear_mu = nn.Linear(self.hparams.num_channels[-1], self.density_num) self.linear_sigmasq = nn.Linear( - self.hparams.num_channels[-1], self.density_num, + self.hparams.num_channels[-1], + self.density_num, ) self.linear_mix = nn.Linear(self.hparams.num_channels[-1], self.density_num) self = self.float() diff --git a/src/dagobert/modelling/dl/tcn_args.py b/src/dagobert/modelling/dl/tcn_args.py index cd15e7db..d08b0a0b 100644 --- a/src/dagobert/modelling/dl/tcn_args.py 
+++ b/src/dagobert/modelling/dl/tcn_args.py @@ -43,7 +43,10 @@ def add_run_specific_args(parent_parser): help="Number of cores to use to prepare the batches.", ) parser.add_argument( - "--exp_name", type=str, default="TCN", help="Name of experiment.", + "--exp_name", + type=str, + default="TCN", + help="Name of experiment.", ) parser.add_argument( "--tags", @@ -233,7 +236,9 @@ def add_data_specific_args(parent_parser): # this is just a place-holder so it's easier to read the million params in the cmd parser.add_argument("--DATA_PARAMS", help="====================================") parser.add_argument( - "--data_dir", type=str, help="Path to folder holding the data files to use.", + "--data_dir", + type=str, + help="Path to folder holding the data files to use.", ) parser.add_argument( "--lookback", type=float, default=6, help="Lookback length in hours." @@ -267,7 +272,9 @@ def add_data_specific_args(parent_parser): ), ) parser.add_argument( - "--to_label", action="store_true", help="Label datasets before preprocessing.", + "--to_label", + action="store_true", + help="Label datasets before preprocessing.", ) parser.add_argument( "--label_sl", type=int, default=1, help="Stop-loss barrier size." 
@@ -279,7 +286,10 @@ def add_data_specific_args(parent_parser): "--label_first_or_max", type=str, default=NBarriers.first, - choices=[NBarriers.first, NBarriers.max,], + choices=[ + NBarriers.first, + NBarriers.max, + ], help="Weather to use the first or maximum barrier-touch.", ) parser.add_argument( diff --git a/src/dagobert/modelling/dl/tcn_net.py b/src/dagobert/modelling/dl/tcn_net.py index 0bf2bba7..b49b139a 100644 --- a/src/dagobert/modelling/dl/tcn_net.py +++ b/src/dagobert/modelling/dl/tcn_net.py @@ -40,7 +40,14 @@ class TemporalBlock(nn.Module): """ def __init__( - self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2, + self, + n_inputs, + n_outputs, + kernel_size, + stride, + dilation, + padding, + dropout=0.2, ): super(TemporalBlock, self).__init__() self.conv1 = weight_norm( From 06aec4e418d48a38a7eb048da7bad68ce7d6ead0 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Mon, 4 Jan 2021 18:15:39 +0000 Subject: [PATCH 17/62] ppo now trains and loss doesn't explode (nearly as often) --- config/rl_config.yaml | 13 ++-- src/dagobert/modelling/rl/environment.py | 72 +++++++++++---------- src/dagobert/modelling/rl/networks.py | 26 ++++++-- src/dagobert/modelling/rl/ppo.py | 79 +++++++++--------------- 4 files changed, 99 insertions(+), 91 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 3c0e573d..5e9315ec 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -23,7 +23,7 @@ tags: - RL_test no_comet_logger: True seed: 42 -batch_size: 128 +batch_size: 256 # -------------------------------------------------------------------------------------- @@ -35,14 +35,15 @@ asset_names: - ETH trading_cost: 0.002 reward_type: return -max_episode_length: 1000 -steps_per_epoch: 2000 +max_episode_length: 500 +steps_per_epoch: 5000 n_optim_iters: 4 gamma: 0.99 -lamb: 0.95 -lr_actor: 0.0003 +lam: 0.95 +lr_actor: 0.001 lr_critic: 0.001 -clip_ratio: 0.2 +clip_ratio: 0.25 +target_kl: 0.01 # don't change these, or 
preprocessing won't work target_col: rl_return diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index ded6c16c..a7074f91 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -36,7 +36,16 @@ def __init__( Args: hparams: Hyparams parsed by the rl_runner. Similar to how `TCNLightning` is - initialized. + initialized with the following fields: + - max_episode_length + - cols_to_model + - target_col + - mini_series_length + - data_dir + - augment_dfs + - augment_prob + - augment_method + - augment_dfs_mix train_val_test: Whether we are training, validating or testing, it must be either train, val or test. """ @@ -68,10 +77,8 @@ def step(self): y1 = np.concatenate([[1.0], ys]) # turn Xs into a batch of 1, ready to be fed into the actor/critic Xs = [torch.Tensor(x).unsqueeze(0) for x in Xs] - episode_full = self.idx == self.hparams.max_episode_length - 1 - done = True if episode_full else False self.idx += 1 - return Xs, y1, done + return Xs, y1 def reset(self): self.idx = np.random.randint(self.dataset_len - self.hparams.max_episode_length) @@ -92,27 +99,21 @@ class RLPortfolio(object): https://github.com/ZhengyaoJiang/PGPortfolio/issues/99 """ - def __init__( - self, - asset_names: List[str], - max_episode_length: int = 1000, - trading_cost: float = 0.002, - reward_type: str = "return", - ): + def __init__(self, hparams: Namespace): """ Class constructor. Args: - asset_names: Names of assets in the portfolio. - trading_cost: Commission rate, currently set to Binance's VIP0 taker level - plus doubled it to account for slippage. TODO: model slippage. - reward_type: Whether to use the log return as reward or the sharpe ratio, - which was found to be more stable. + hparams: Hyparams parsed by the rl_runner. 
Similar to how `TCNLightning` is + initialized with the following fields: + - asset_names + - trading_cost + - reward_type """ - self.asset_names = asset_names - self.asset_n = len(asset_names) - self.trading_cost = trading_cost - self.reward_type = reward_type + self.asset_names = hparams.asset_names + self.asset_n = len(self.asset_names) + self.trading_cost = hparams.trading_cost + self.reward_type = hparams.reward_type self.reset() def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: @@ -198,24 +199,33 @@ def __init__(self, hparams: Namespace): An environment for financial portfolio management. Args: - hparams: - + hparams: Hyparams parsed by the rl_runner. Similar to how `TCNLightning` is + initialized with the following fields: + - max_episode_length + - cols_to_model + - target_col + - mini_series_length + - data_dir + - augment_dfs + - augment_prob + - augment_method + - augment_dfs_mix + - asset_names + - trading_cost + - reward_type """ self.infos = [] self.hparams = hparams self.asset_n = len(self.hparams.asset_names) self.feat_n = len(self.hparams.cols_to_model[npa.anchor]) self.data = RLData(self.hparams, train_val_test="train") - self.portfolio = RLPortfolio( - self.hparams.asset_names, self.hparams.max_episode_length - ) + self.portfolio = RLPortfolio(self.hparams) # setup openai gym env - include cash in the portfolio action space self.action_space = gym.spaces.Box( 0.0, 1.0, shape=(self.asset_n + 1,), dtype=np.float32 ) - - # get the observation space from the data min and max + # observation space isn't used anywhere, but we define it for documnetation self.observation_space = gym.spaces.Dict( { "state": gym.spaces.Box( @@ -241,16 +251,16 @@ def step(self, action: np.array): weights = action weights /= weights.sum() + eps - next_state, y1, done1 = self.data.step() - reward, info, done2 = self.portfolio.step(weights, y1) + next_state, y1 = self.data.step() + reward, info, done = self.portfolio.step(weights, y1) 
self.infos.append(info) - return next_state, reward, done1 or done2, info + return next_state, reward, done, info def reset(self): self.infos = [] self.portfolio.reset() - next_state, _, _ = self.data.reset() + next_state, _ = self.data.reset() return next_state def render(self): diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 664e82d0..1d651a6f 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -32,6 +32,7 @@ def __init__( """ super().__init__() self.hparams = hparams + self.n_actions = n_actions num_inputs = [len(cols) for dataset, cols in hparams.cols_to_model.items()] num_channels = ( hparams.actor_num_channels if actor else hparams.critic_num_channels @@ -88,14 +89,16 @@ def __init__(self, actor_net): any papers or implementations actually doing this. Args: - input_shape: observation shape of the environment - n_actions: number of discrete actions available in the environment + actor_net: Initialized actor net. 
""" super().__init__() self.actor_net = actor_net + self.inv_lin = InverseLinear() def forward(self, states): - concentrations = nn.functional.softmax(self.actor_net(states), dim=1).squeeze(0) + # get params for Dirichlet, and drop batch dim if batch_size=1 + logits = self.actor_net(states) + concentrations = self.inv_lin(logits).squeeze(0) pi = Dirichlet(concentrations) actions = pi.sample() return pi, actions @@ -111,7 +114,22 @@ def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): Returns: log probability of the acition under pi """ - return pi.log_prob(actions).sum(axis=-1) + return pi.log_prob(actions) + + +class InverseLinear(nn.Module): + """ + Implements a layer specifically designed for Dirichlet distribution as final + layer, see here: https://openreview.net/pdf?id=BJeRg205Fm + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + x[x < 0] = 1 / (1 - x[x < 0]) + x[x >= 0] = x[x >= 0] + 1 + return x class ActorCriticAgent(object): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 0c70dab8..6b7b7372 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -130,22 +130,6 @@ def __init__(self, hparams: Namespace): self.avg_ep_len = 0 self.avg_reward = 0 - # def forward( - # self, states: List[torch.Tensor] - # ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # """ - # Passes state x through the network and returns the policy and a sampled action. - - # Args: - # x: environment state - - # Returns: - # Tuple of policy and action - # """ - # pi, action = self.actor(states) - # value = self.critic(states) - # return pi, action, value - def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: """ Calculate the discounted rewards of all rewards in list. 
@@ -157,7 +141,6 @@ def discount_rewards(self, rewards: List[float], discount: float) -> List[float] list of discounted rewards/advantages """ assert isinstance(rewards[0], float) - cumul_reward = [] sum_r = 0.0 for r in reversed(rewards): @@ -165,29 +148,28 @@ def discount_rewards(self, rewards: List[float], discount: float) -> List[float] cumul_reward.append(sum_r) return list(reversed(cumul_reward)) - def calc_advantage( - self, rewards: List[float], values: List[float], last_value: float - ) -> List[float]: + def calc_advantage(self, rewards: List[float], values: List[float]) -> List[float]: """ Calculate the advantage given rewards, state values, and last value of episode. Args: rewards: list of episode rewards values: list of state values from critic - last_value: value of last state of episode Returns: list of advantages """ - rews = rewards + [last_value] - vals = values + [last_value] # GAE delta = [ - rews[i] + self.hparams.gamma * vals[i + 1] - vals[i] - for i in range(len(rews) - 1) + rewards[i] + self.hparams.gamma * values[i + 1] - values[i] + for i in range(len(rewards) - 1) ] adv = self.discount_rewards(delta, self.hparams.gamma * self.hparams.lam) - return adv + + # normalise advantage + adv = np.array(adv) + adv = (adv - adv.mean()) / (adv.std() + np.finfo(float).eps) + return list(adv) def train_batch( self, @@ -220,24 +202,13 @@ def train_batch( epoch_end = step == (self.hparams.steps_per_epoch - 1) terminal = len(self.ep_rewards) == self.hparams.max_episode_length - if epoch_end or done or terminal: - # if trajectory ends abtruptly, boostrap value of next state - if (terminal or epoch_end) and not done: - with torch.no_grad(): - _, _, _, value = self.agent(self.state, self.device) - last_value = value.item() - else: - last_value = 0 - # discounted cumulative reward self.batch_qvals += self.discount_rewards( - self.ep_rewards + [last_value], self.hparams.gamma + self.ep_rewards, self.hparams.gamma )[:-1] # advantage - self.batch_adv += 
self.calc_advantage( - self.ep_rewards, self.ep_values, last_value - ) + self.batch_adv += self.calc_advantage(self.ep_rewards, self.ep_values) # logs self.done_episodes += 1 self.epoch_rewards += np.sum(self.ep_rewards) @@ -274,12 +245,15 @@ def train_batch( def actor_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: pi, _ = self.actor(state) logp = self.actor.get_log_prob(pi, action) - ratio = torch.exp(logp - logp_old.sum(-1)) + old_new_diff = logp - logp_old + ratio = torch.exp(old_new_diff) + # idea taken from spinningup PPO implemenetation to prevent exploding loss + approx_kl = old_new_diff.mean().item() clip_ratio = torch.clamp( ratio, 1 - self.hparams.clip_ratio, 1 + self.hparams.clip_ratio ) loss_actor = -(torch.min(ratio * adv, clip_ratio * adv)).mean() - return loss_actor + return loss_actor, approx_kl def critic_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: value = self.critic(state) @@ -314,9 +288,8 @@ def training_step( self.log( "avg_reward", self.avg_reward, prog_bar=True, on_step=False, on_epoch=True ) - - if optimizer_idx % 2 == 0: - loss_actor = self.actor_loss(state, action, old_logp, qval, adv) + if optimizer_idx == 0: + loss_actor, approx_kl = self.actor_loss(state, action, old_logp, qval, adv) self.log( "loss_actor", loss_actor, @@ -325,9 +298,17 @@ def training_step( prog_bar=True, logger=True, ) + self.log( + "approx_kl", + approx_kl, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) return loss_actor - else: + elif optimizer_idx == 1: loss_critic = self.critic_loss(state, action, old_logp, qval, adv) self.log( "loss_critic", @@ -345,13 +326,11 @@ def configure_optimizers(self) -> List[optim.Optimizer]: optimizer_critic = optim.Adam( self.critic.parameters(), lr=self.hparams.lr_critic ) + return optimizer_actor, optimizer_critic - # workaround to run multple steps of gradient descent within LightningModule - optimizers = [] + def optimizer_step(self, *args, **kwargs): for _ in 
range(self.hparams.n_optim_iters): - optimizers.append(optimizer_actor) - optimizers.append(optimizer_critic) - return optimizers + super().optimizer_step(*args, **kwargs) def train_dataloader(self) -> DataLoader: """Initialize the Replay Buffer dataset used for retrieving experiences""" From 9a7a57a7e606c709af8563458e89b3c62526b206 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Thu, 7 Jan 2021 00:12:50 +0000 Subject: [PATCH 18/62] started --- notebooks/modelling/rl_env.ipynb | 4 +- src/dagobert/data/lambda/orderbook_data.py | 28 ++++++++++ src/dagobert/modelling/augmentation/tgan.py | 58 +++++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 src/dagobert/data/lambda/orderbook_data.py create mode 100644 src/dagobert/modelling/augmentation/tgan.py diff --git a/notebooks/modelling/rl_env.ipynb b/notebooks/modelling/rl_env.ipynb index ea5164b2..a3333841 100644 --- a/notebooks/modelling/rl_env.ipynb +++ b/notebooks/modelling/rl_env.ipynb @@ -346,9 +346,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.6 64-bit ('dagobert': conda)", + "display_name": "Python 3", "language": "python", - "name": "python37664bitdagobertconda90fcdb25face404d8cd237e8f8473045" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/src/dagobert/data/lambda/orderbook_data.py b/src/dagobert/data/lambda/orderbook_data.py new file mode 100644 index 00000000..0ddbfb33 --- /dev/null +++ b/src/dagobert/data/lambda/orderbook_data.py @@ -0,0 +1,28 @@ +import os +import boto3 +from binance.client import Client +import pandas as pd +import time + + +def fetch_orderbook_data(): + s3 = boto3.resource("s3") + client = Client("", "") + + pairs = ["BTCUSDT", "ETHUSDT", "XRPUSDT", "BCHUSDT", "LTCUSDT"] + bucket_name = "dagobert-orderbook" + + for pair in pairs: + response = client.get_order_book(symbol=pair, limit=1000) + + df = pd.DataFrame(response) + df = df[["bids", "asks"]] + + name = f"{pair}_{int(time.time())}.csv" + + 
df.to_csv(name, compression="gzip") + + file_object = s3.Object(bucket_name, name) + file_object.upload_file(name) + + os.remove(name) diff --git a/src/dagobert/modelling/augmentation/tgan.py b/src/dagobert/modelling/augmentation/tgan.py new file mode 100644 index 00000000..5706658d --- /dev/null +++ b/src/dagobert/modelling/augmentation/tgan.py @@ -0,0 +1,58 @@ +""" +TimeGAN network, following the original implementation: +https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/tgan.py. +""" +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as f +from torch.nn.utils import weight_norm + + +class Generator(nn.Module): + def __init__( + self, + input_size, + hidden_size, + num_layers, + dropout, + batch_first=True, + ): + super(Generator, self).__init__() + + # input/output: (batch, seq, feature) + # TODO: hparams? + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + # TODO: size of output is ? 
+ self.linear = nn.Linear(hidden_size, hidden_size) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + # lstm_out = (batch_size, seq_len, hidden_size) + lstm_out, _ = self.lstm(x) + y_pred = self.linear(lstm_out[:, -1]) + return y_pred + + +def rnn_cell(module_name): + # GRU + if module_name == "gru": + rnn_cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_dim, activation=tf.nn.tanh) + # LSTM + elif module_name == "lstm": + rnn_cell = tf.contrib.rnn.BasicLSTMCell( + num_units=hidden_dim, activation=tf.nn.tanh + ) + # LSTM Layer Normalization + elif module_name == "lstmLN": + rnn_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( + num_units=hidden_dim, activation=tf.nn.tanh + ) + return rnn_cell From 0e7cc2556f6f68362a7e6f98f3ae06b6a53de246 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Thu, 7 Jan 2021 09:34:46 +0000 Subject: [PATCH 19/62] small changes --- config/rl_config.yaml | 1 - notebooks/modelling/rl_env.ipynb | 41 ++-------- src/dagobert/modelling/dl/data.py | 4 +- src/dagobert/modelling/rl/networks.py | 76 +++++++++++-------- src/dagobert/modelling/rl/ppo.py | 67 ++++++++++++---- src/dagobert/modelling/utils.py | 2 +- .../feature_creation/time_features.py | 4 +- 7 files changed, 108 insertions(+), 87 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 5e9315ec..e6488e7a 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -25,7 +25,6 @@ no_comet_logger: True seed: 42 batch_size: 256 - # -------------------------------------------------------------------------------------- # RL # -------------------------------------------------------------------------------------- diff --git a/notebooks/modelling/rl_env.ipynb b/notebooks/modelling/rl_env.ipynb index ea5164b2..e4e1b0f2 100644 --- a/notebooks/modelling/rl_env.ipynb +++ b/notebooks/modelling/rl_env.ipynb @@ -4,16 +4,7 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The 
autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -37,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -46,7 +37,7 @@ "array([0.48192771, 0.26506024, 0.25301205])" ] }, - "execution_count": 43, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -70,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -79,7 +70,7 @@ "0.00020481927710843396" ] }, - "execution_count": 44, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -114,26 +105,6 @@ "p0 * (1 - mu)" ] }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0375" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.dot(y1, w0)" - ] - }, { "cell_type": "code", "execution_count": 45, @@ -189,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 10, "metadata": {}, "outputs": [ { diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 91e96391..969a2a56 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -325,7 +325,7 @@ def _get_target(self, df: pd.DataFrame) -> np.array: """Returns the target values (y) to use for batching for a given DF.""" if self.simple_lookahead_y: # calculate simple moving average on the close original to smooth it - mean_bar_length = pd.Series(self.idx).diff().dt.seconds.mean() / 60 + mean_bar_length = pd.Series(self.idx).diff().dt.total_seconds().mean() / 60 window_size = int(np.round(self.simple_lookahead_y / mean_bar_length)) return ( df[npa.close_original] @@ -541,7 +541,7 @@ def plot(self) -> Tuple[Figure]: # plot the data columns, add 
date_diff_seconds to data coumns for sanity check df_data = df[self.cols_to_model[npa.anchor]].copy(deep=True) - date_diff_secs = pd.Series(df.index).diff().dt.seconds.values + date_diff_secs = pd.Series(df.index).diff().dt.total_seconds().values df_data.insert(0, "date_diff_secs", date_diff_secs) subplot_cols_n = 5 cols_n_to_plot = len(df_data.columns) diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 1d651a6f..3f05e70c 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -16,16 +16,19 @@ class ActorCriticTCN(nn.Module): """ def __init__( - self, hparams: Namespace, n_actions: int, actor: bool = True + self, hparams: Namespace, n_actions: int, output_size: int, actor: bool = True ) -> nn.Module: """ Init a TCN like we do in `dagobert.modelling.dl.tcn`. Args: hparams: Hparam parsed and updated by PPO module in dagobert.modelling.rl. - n_actions: Number of units at the end of the network: different for actor/critic - actor: If True, we are using the network params in hparams for the actor net, - else we take the params for the critic. + n_actions: Dimension of actions which is one of the inputs to the networks + along with the state (history price tensor). + output_size: Number of units at the end of the network. This is + different for actor/critic. + actor: If True, we are using the network params in hparams for the actor + net, else we take the params for the critic. Returns: Initiated TCN with the appropriate size for actor or critic. 
@@ -47,16 +50,21 @@ def __init__( time_feat_n=hparams.time_feat_n, time_embed_dim=hparams.time_embed_dim, ) + self.linear_a = nn.Linear(n_actions + 1, num_channels[-1]) self.linear1 = nn.Linear(hparams.mini_series_length, 1) - self.linear2 = nn.Linear(num_channels[-1], n_actions) + # self.linear2 = nn.Linear(num_channels[-1] * 2, output_size) + self.linear2 = nn.Linear(num_channels[-1], output_size) - def forward(self, x): - y1 = self.tcn(*x) + def forward(self, state, past_pw): + s1 = self.tcn(*state) + a1 = torch.tanh(self.linear_a(past_pw)) if self.hparams.use_last_timepoint: - return self.linear2(y1[:, :, -1]) + s2 = s1[:, :, -1] else: - y2 = nn.functional.relu(self.linear1(y1).squeeze(-1)) - return self.linear2(y2) + s2 = torch.tanh(self.linear1(s1).squeeze(-1)) + # bring together the state and past_pw representations + # return self.linear2(torch.cat([s2, a1], dim=1)) + return self.linear2(s2) class ActorContinous(nn.Module): @@ -95,9 +103,9 @@ def __init__(self, actor_net): self.actor_net = actor_net self.inv_lin = InverseLinear() - def forward(self, states): + def forward(self, states, past_pw): # get params for Dirichlet, and drop batch dim if batch_size=1 - logits = self.actor_net(states) + logits = self.actor_net(states, past_pw) concentrations = self.inv_lin(logits).squeeze(0) pi = Dirichlet(concentrations) actions = pi.sample() @@ -117,21 +125,6 @@ def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): return pi.log_prob(actions) -class InverseLinear(nn.Module): - """ - Implements a layer specifically designed for Dirichlet distribution as final - layer, see here: https://openreview.net/pdf?id=BJeRg205Fm - """ - - def __init__(self): - super().__init__() - - def forward(self, x): - x[x < 0] = 1 / (1 - x[x < 0]) - x[x >= 0] = x[x >= 0] + 1 - return x - - class ActorCriticAgent(object): """ Actor Critic Agent used during trajectory collection. 
It returns a @@ -145,21 +138,27 @@ def __init__(self, actor_net: nn.Module, critic_net: nn.Module): self.critic_net = critic_net @torch.no_grad() - def __call__(self, state: torch.Tensor, device: str) -> Tuple: + def __call__( + self, state: torch.Tensor, past_pw: torch.Tensor, device: str + ) -> Tuple: """ Takes in the current state and returns the agents policy, sampled action, log probability of the action, and value of the given state Args: - states: current state of the environment + state: current state of the environment + past_pw: the previous portfolio value and weights device: the device used for the current batch Returns: torch dsitribution and randomly sampled action """ state = [s.to(device=device) for s in state] - pi, actions = self.actor_net(state) + past_pw = past_pw.to(device=device) + + pi, actions = self.actor_net(state, past_pw) log_p = self.get_log_prob(pi, actions) - value = self.critic_net(state) + + value = self.critic_net(state, past_pw) return pi, actions, log_p, value def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor) -> torch.Tensor: @@ -173,3 +172,18 @@ def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor) -> torch.Tensor: log probability of the acition under pi """ return self.actor_net.get_log_prob(pi, actions) + + +class InverseLinear(nn.Module): + """ + Implements a layer specifically designed for Dirichlet distribution as final + layer, see here: https://openreview.net/pdf?id=BJeRg205Fm + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + x[x < 0] = 1 / (1 - x[x < 0]) + x[x >= 0] = x[x >= 0] + 1 + return x diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 6b7b7372..c64780ee 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -109,13 +109,18 @@ def __init__(self, hparams: Namespace): # create env, init starting state and policy/value networks self.env = RLEnv(self.hparams) self.state = self.env.reset() - self.critic = 
ActorCriticTCN(self.hparams, 1, actor=False) - actor_tcn = ActorCriticTCN(self.hparams, self.env.action_space.shape[0]) - self.actor = ActorContinous(actor_tcn) + n_actions = self.env.action_space.shape[0] + self.critic = ActorCriticTCN( + self.hparams, n_actions=n_actions, output_size=1, actor=False + ) + self.actor = ActorContinous( + ActorCriticTCN(self.hparams, n_actions=n_actions, output_size=n_actions) + ) self.agent = ActorCriticAgent(self.actor, self.critic) # init batching and progress tracking vars self.batch_states = [] + self.batch_past_pw = [] self.batch_actions = [] self.batch_adv = [] self.batch_qvals = [] @@ -165,9 +170,16 @@ def calc_advantage(self, rewards: List[float], values: List[float]) -> List[floa for i in range(len(rewards) - 1) ] adv = self.discount_rewards(delta, self.hparams.gamma * self.hparams.lam) + return self.normalise_advantage(adv) + @staticmethod + def normalise_advantage(batch_adv: List[float]) -> List[float]: + """ + Normalise across all episodes within the epoch. Apparently this helps with + covergence. + """ # normalise advantage - adv = np.array(adv) + adv = np.array(batch_adv) adv = (adv - adv.mean()) / (adv.std() + np.finfo(float).eps) return list(adv) @@ -187,19 +199,21 @@ def train_batch( Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage. 
""" - + past_pw = self._init_past_pw() for step in range(self.hparams.steps_per_epoch): - pi, action, log_prob, value = self.agent(self.state, self.device) + pi, action, log_prob, value = self.agent(self.state, past_pw, self.device) next_state, reward, done, info = self.env.step(action.cpu().numpy()) # drop first batch dim so dataloader later can resample them for backprop self.batch_states.append([s.squeeze(0) for s in self.state]) + self.batch_past_pw.append(past_pw.squeeze(0)) self.batch_actions.append(action) self.batch_logp.append(log_prob) self.ep_rewards.append(reward) self.ep_values.append(value.item()) self.state = next_state + past_pw = self._update_past_pw(info["portfolio_value"], action) epoch_end = step == (self.hparams.steps_per_epoch - 1) terminal = len(self.ep_rewards) == self.hparams.max_episode_length if epoch_end or done or terminal: @@ -220,16 +234,18 @@ def train_batch( if epoch_end: train_data = zip( self.batch_states, + self.batch_past_pw, self.batch_actions, self.batch_logp, self.batch_qvals, self.batch_adv, ) - for state, action, logp_old, qval, adv in train_data: - yield state, action, logp_old, qval, adv + for state, past_pw, action, logp_old, qval, adv in train_data: + yield state, past_pw, action, logp_old, qval, adv self.batch_states.clear() + self.batch_past_pw.clear() self.batch_actions.clear() self.batch_adv.clear() self.batch_logp.clear() @@ -242,8 +258,8 @@ def train_batch( self.epoch_rewards = 0 self.done_episodes = 0 - def actor_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: - pi, _ = self.actor(state) + def actor_loss(self, state, past_pw, action, logp_old, adv) -> torch.Tensor: + pi, _ = self.actor(state, past_pw) logp = self.actor.get_log_prob(pi, action) old_new_diff = logp - logp_old ratio = torch.exp(old_new_diff) @@ -255,8 +271,8 @@ def actor_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: loss_actor = -(torch.min(ratio * adv, clip_ratio * adv)).mean() return loss_actor, approx_kl - def 
critic_loss(self, state, action, logp_old, qval, adv) -> torch.Tensor: - value = self.critic(state) + def critic_loss(self, state, past_pw, qval) -> torch.Tensor: + value = self.critic(state, past_pw) loss_critic = (qval - value).pow(2).mean() return loss_critic @@ -274,7 +290,7 @@ def training_step( Returns: loss """ - state, action, old_logp, qval, adv = batch + state, past_pw, action, old_logp, qval, adv = batch self.log( "avg_ep_len", self.avg_ep_len, prog_bar=True, on_step=False, on_epoch=True ) @@ -289,7 +305,9 @@ def training_step( "avg_reward", self.avg_reward, prog_bar=True, on_step=False, on_epoch=True ) if optimizer_idx == 0: - loss_actor, approx_kl = self.actor_loss(state, action, old_logp, qval, adv) + loss_actor, approx_kl = self.actor_loss( + state, past_pw, action, old_logp, adv + ) self.log( "loss_actor", loss_actor, @@ -309,7 +327,7 @@ def training_step( return loss_actor elif optimizer_idx == 1: - loss_critic = self.critic_loss(state, action, old_logp, qval, adv) + loss_critic = self.critic_loss(state, past_pw, qval) self.log( "loss_critic", loss_critic, @@ -338,6 +356,25 @@ def train_dataloader(self) -> DataLoader: dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size) return dataloader + def _init_past_pw(self) -> torch.Tensor: + """ + Init past portfolio value and weights to [1, 1, 0, ..., 0], since after the + portfolio is reset for each trajector p0=1, w0[0]=1 (USD relative price is + always 1). + """ + past_pw = torch.zeros(len(self.hparams.asset_names) + 2).to(self.device) + past_pw[:2] = 1 + return past_pw.unsqueeze(0) + + def _update_past_pw(self, p1: float, action: torch.Tensor) -> torch.Tensor: + """ + After each interaction, update the past weight / portfolio value vector as for + the next interaction the actor and critic networks take that in along with the + new state to form their outputs. 
+ """ + p1 = torch.Tensor([p1]).to(self.device) + return torch.cat([p1, action]).unsqueeze(0) + @staticmethod def _pre_sanity_check(hparams: Namespace): # ensure we have the rl specific target column in the config diff --git a/src/dagobert/modelling/utils.py b/src/dagobert/modelling/utils.py index ac34c93c..e23befd4 100644 --- a/src/dagobert/modelling/utils.py +++ b/src/dagobert/modelling/utils.py @@ -409,7 +409,7 @@ def update_lookback( for _ in range(num_samples): s = np.random.randint(anchor_len) diffs.append(idx[s + mini_series_length] - idx[s]) - lookback = (pd.Series(diffs).dt.seconds / 3600).quantile(quantile) + lookback = (pd.Series(diffs).dt.total_seconds() / 3600).quantile(quantile) return lookback diff --git a/src/dagobert/preprocessing/feature_creation/time_features.py b/src/dagobert/preprocessing/feature_creation/time_features.py index 3cda9c46..479e82b1 100644 --- a/src/dagobert/preprocessing/feature_creation/time_features.py +++ b/src/dagobert/preprocessing/feature_creation/time_features.py @@ -105,11 +105,11 @@ def add_time_features(self) -> Optional: if self.add_time_to_label: btt_col = pd.to_datetime(self.df_bars[self.barrier_touch_time_col]) - time_to_label = (btt_col - date_col).dt.seconds.fillna(0) + time_to_label = (btt_col - date_col).dt.total_seconds().fillna(0) self.df_bars.insert(0, NTimeFeatures.time_to_label, time_to_label) if self.add_date_diff: - date_diff = date_col.diff().dt.seconds.fillna(0) + date_diff = date_col.diff().dt.total_seconds().fillna(0) self.df_bars.insert(0, NTimeFeatures.date_diff, date_diff) logger.info("Added time features.") From 530f1218c6f1aad8f7bb48e8864834a97865cdc1 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Fri, 8 Jan 2021 21:05:44 +0000 Subject: [PATCH 20/62] init --- config/timegan_config.yaml | 49 +++ src/dagobert/modelling/augmentation/tgan.py | 58 ---- .../modelling/augmentation/timegan.py | 302 ++++++++++++++++++ 3 files changed, 351 insertions(+), 58 deletions(-) create mode 100644 
config/timegan_config.yaml delete mode 100644 src/dagobert/modelling/augmentation/tgan.py create mode 100644 src/dagobert/modelling/augmentation/timegan.py diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml new file mode 100644 index 00000000..e1f8d377 --- /dev/null +++ b/config/timegan_config.yaml @@ -0,0 +1,49 @@ + +# -------------------------------------------------------------------------------------- +# LIGHTNING +# -------------------------------------------------------------------------------------- + +gpus: 0 + + +# -------------------------------------------------------------------------------------- +# RUN +# -------------------------------------------------------------------------------------- + +log_dir: logs +num_workers: 4 +exp_name: Time-GAN +tags: + - time_gan_test +no_comet_logger: True +seed: 42 +batch_size: 256 + + +# -------------------------------------------------------------------------------------- +# GAN +# -------------------------------------------------------------------------------------- + +# gru or lstm +rnn: lstm + +# -------------------------------------------------------------------------------------- +# MODEL +# -------------------------------------------------------------------------------------- + +dropout: 0.2 +num_layers: 2 +hidden_size: 50 +z_dim: 50 + +# -------------------------------------------------------------------------------------- +# DATA +# -------------------------------------------------------------------------------------- + +#data_dir: "C:/Work/dagobert/data/modelling" +#data_dir: "/home/daniel/dagobert_data/modelling" +data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" + +# -------------------------------------------------------------------------------------- +# PREPROCESSING +# -------------------------------------------------------------------------------------- diff --git a/src/dagobert/modelling/augmentation/tgan.py b/src/dagobert/modelling/augmentation/tgan.py deleted file mode 
100644 index 5706658d..00000000 --- a/src/dagobert/modelling/augmentation/tgan.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -TimeGAN network, following the original implementation: -https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/tgan.py. -""" -from typing import List - -import torch -import torch.nn as nn -import torch.nn.functional as f -from torch.nn.utils import weight_norm - - -class Generator(nn.Module): - def __init__( - self, - input_size, - hidden_size, - num_layers, - dropout, - batch_first=True, - ): - super(Generator, self).__init__() - - # input/output: (batch, seq, feature) - # TODO: hparams? - self.lstm = nn.LSTM( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=dropout, - batch_first=batch_first, - ) - # TODO: size of output is ? - self.linear = nn.Linear(hidden_size, hidden_size) - self.sigmoid = nn.Sigmoid() - - def forward(self, x): - # lstm_out = (batch_size, seq_len, hidden_size) - lstm_out, _ = self.lstm(x) - y_pred = self.linear(lstm_out[:, -1]) - return y_pred - - -def rnn_cell(module_name): - # GRU - if module_name == "gru": - rnn_cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_dim, activation=tf.nn.tanh) - # LSTM - elif module_name == "lstm": - rnn_cell = tf.contrib.rnn.BasicLSTMCell( - num_units=hidden_dim, activation=tf.nn.tanh - ) - # LSTM Layer Normalization - elif module_name == "lstmLN": - rnn_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( - num_units=hidden_dim, activation=tf.nn.tanh - ) - return rnn_cell diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py new file mode 100644 index 00000000..94a2b076 --- /dev/null +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -0,0 +1,302 @@ +""" +TimeGAN network, following the original implementation: +https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/tgan.py. 
+""" +from typing import List, Optional +from argparse import Namespace +import logging +from pathlib import Path + +import numpy as np +import pandas as pd +import matplotlib +from scipy.stats import spearmanr +from matplotlib.figure import Figure + +import torch +import torch.nn as nn +import torch.nn.functional as f +from torch.nn.utils import weight_norm + +from pytorch_lightning import LightningModule + + +class RnnBlock(nn.Module): + """ + Generate time-series data in latent space. + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int, + dropout: float = 0.2, + batch_first: bool = True, + rnn: str = "lstm", + ): + super(RnnBlock, self).__init__() + + # input/output: (batch, seq, feature) + if rnn == "lstm": + self.rnn = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + elif rnn == "gru": + self.rnn = nn.GRU( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + self.tanh = nn.Tanh() + # TODO: whats' size of output in latent space + self.linear = nn.Linear(hidden_size, hidden_size) + self.sigmoid = nn.Sigmoid() + + def forward(self, z): + # lstm_out = (batch_size, seq_len, hidden_size) + rnn_out, _hidden = self.rnn(z) + output = self.tanh(rnn_out) + return output + + +class Supervisor(nn.Module): + """ + Generate next sequence using the previous sequence. + """ + + def __init__( + self, + input_size, + hidden_size, + num_layers, + dropout, + batch_first=True, + ): + super(Supervisor, self).__init__() + + # input/output: (batch, seq, feature) + # TODO: hparams? 
+ self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + self.tanh = nn.Tanh() + # TODO: whats' size of output in latent space + self.linear = nn.Linear(hidden_size, hidden_size) + self.sigmoid = nn.Sigmoid() + + def forward(self, z): + # lstm_out = (batch_size, seq_len, hidden_size) + lstm_out, _hidden = self.lstm(z) + lstm_out = self.tanh(lstm_out) + synthetic_series = self.sigmoid(self.linear(lstm_out)) + return synthetic_series + + +class Discriminator(nn.Module): + """ + Discriminate the original and synthetic time-series data + """ + + def __init__( + self, + input_size, + hidden_size, + num_layers, + dropout, + batch_first=True, + ): + super(Discriminator, self).__init__() + + # input/output: (batch, seq, feature) + # TODO: hparams? + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + self.tanh = nn.Tanh() + # TODO: whats' size of output in latent space + self.linear = nn.Linear(hidden_size, 1) + + def forward(self, z): + # lstm_out = (batch_size, seq_len, hidden_size) + lstm_out, _hidden = self.lstm(z) + lstm_out = self.tanh(lstm_out) + synthetic_series = self.linear(lstm_out) + return synthetic_series + + +class Embedder(nn.Module): + """ + Embedding network between original feature space to latent space. + """ + + def __init__( + self, + input_size, + hidden_size, + num_layers, + dropout, + batch_first=True, + ): + super(Embedder, self).__init__() + + # input/output: (batch, seq, feature) + # TODO: hparams? 
+ self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + self.tanh = nn.Tanh() + # TODO: whats' size of output in latent space + self.linear = nn.Linear(hidden_size, hidden_size) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + # lstm_out = (batch_size, seq_len, hidden_size) + lstm_out, _hidden = self.lstm(x) + lstm_out = self.tanh(lstm_out) + embedded_real = self.sigmoid(self.linear(lstm_out)) + return embedded_real + + +class Recovery(nn.Module): + """ + Recovery network from latent space to original space. + """ + + def __init__( + self, + input_size, + hidden_size, + num_layers, + dropout, + batch_first=True, + ): + super(Recovery, self).__init__() + + # input/output: (batch, seq, feature) + # TODO: hparams? + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + batch_first=batch_first, + ) + self.tanh = nn.Tanh() + # TODO: output size is same as original number of features + self.linear = nn.Linear(hidden_size, input_size) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + # lstm_out = (batch_size, seq_len, hidden_size) + lstm_out, _hidden = self.lstm(x) + lstm_out = self.tanh(lstm_out) + X_tilde = self.sigmoid(self.linear(lstm_out)) + return X_tilde + + +class TimeGANLightning(LightningModule): + """ + Lightning model made of RNN nets working together. + """ + + # ---------------------------------------------------------------------------------- + # INIT, FORWARD, OPTIMIZER SETUP + # ---------------------------------------------------------------------------------- + + def __init__(self, hparams: Namespace): + """ + Class constructor. + + Args: + hparams: Hyper-params passed in to the module. See the docs for more details + https://pytorch-lightning.readthedocs.io/en/latest/hyperparameters.html + and dagobert.modelling.dl.tcn_args for more information on the params. 
+ """ + + # define main vars (other than model) + super().__init__() + # TODO: sanity check, define hparams + # lightning sets this to cuda too late for some of our setup to work + self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" + # TODO: check if real data is the right one, get data in + # TODO: any sanity checks on data, hypermparams + # TODO set up losses + self.real_logging = None + self.comet_logging = not self.hparams.no_comet_logger + + # get feature number of instruments + num_inputs = [len(cols) for dataset, cols in self.hparams.cols_to_model.items()] + all_inputs = sum(num_inputs) + + # components of network + self.generator = RnnBlock( + input_size=self.hparams.z_dim, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + dropout=self.hparams.dropout, + batch_first=True, + rnn=self.hparams.rnn, + ) + self.embedder = RnnBlock( + input_size=all_inputs, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + dropout=self.hparams.dropout, + batch_first=True, + rnn=self.hparams.rnn, + ) + self.supervisor = RnnBlock( + input_size=self.hparams.hidden_size, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + dropout=self.hparams.dropout, + batch_first=True, + rnn=self.hparams.rnn, + ) + self.recovery = RnnBlock( + input_size=self.hparams.hidden_size, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + dropout=self.hparams.dropout, + batch_first=True, + rnn=self.hparams.rnn, + ) + self.discriminator = RnnBlock( + input_size=self.hparams.hidden_size, + hidden_size=self.hparams.hidden_size, + num_layers=self.hparams.num_layers, + dropout=self.hparams.dropout, + batch_first=True, + rnn=self.hparams.rnn, + ) + # final linear layers + self.generator_linear = nn.Linear( + self.hparams.hidden_size, self.hparams.hidden_size + ) + self.embedder_linear = nn.Linear( + self.hparams.hidden_size, self.hparams.hidden_size + ) + self.supervisor_linear = nn.Linear( + 
self.hparams.hidden_size, self.hparams.hidden_size + ) + self.recovery_linear = nn.Linear(self.hparams.hidden_size, all_inputs) + self.discriminator_linear = nn.Linear(self.hparams.hidden_size, 1) From bf55dc4fcc67a5bfe14c3ba9f95b0b671c7466bb Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 9 Jan 2021 15:34:41 +0000 Subject: [PATCH 21/62] fixing inverse linear implementation, changing past weights to past dirichlet concentrations in what we pass to the network along with the state --- notebooks/notes/rl_episodes.xlsx | Bin 0 -> 10100 bytes src/dagobert/modelling/rl/networks.py | 20 +++++++++++--------- src/dagobert/modelling/rl/ppo.py | 16 +++++++++------- 3 files changed, 20 insertions(+), 16 deletions(-) create mode 100644 notebooks/notes/rl_episodes.xlsx diff --git a/notebooks/notes/rl_episodes.xlsx b/notebooks/notes/rl_episodes.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..db08bae49194fc64e5a634d6427db76c96b6e647 GIT binary patch literal 10100 zcmeHtgf)@BMZ+ zd%wTn-kxW=o|&%qRCU*R>#aVgRODe`aRBfDAOHZM1ehLWTN^?F0CBJY05$*!tuNu^ z=x*WYZlvMuY~l8X&C9`_A|Do-Ar}A*ssF#@zxWAMCXFcdabQSY%RESIvdS&iiUMCA z27bX}QWfd$PaG^a)6cQ7dCrNr#}LoMwdMVc^=;9I^K{g@*3qFcFnq904J8T$?$$IU z=4BuFazNXKPnh7Ur+b`>LoE7&(8wspG82&I($uHMEx9HpS4m`zizgDWkv;!K3v*?# z1*KfClpnm%v#O-EjLA1#HifYu=OL)3-JT`Lc+ zwGpd+jwpySaPkL0#V)EIuZq+0+xz}kxCII$C%qI)_q1!+B>gMI=B+ugapp<{R1Pew zMPG6xmHBy@0{u$BL(P2M%75TsnuMyFW8>)@|H+b-J0K~#x%!OfXds*+`ju-P zjdjEMP5KLM?xnw8J@@HIws$`-VqXRLWGFC-ymT3&Q2!)Jsu;1p3k(3@4UsZ*$dmE1 zXZLh+wKH{cvinW9DmC9YWy)c^|1|v^;6-0U!10quyj!`)SW1iwtFE#_mKp&=iHQZ&$za3$x?VsKu6pp{vt3N| z46A6RjntMKdb!>yp%Naj|Tb=hr2%q#Kw3HMr1*4a%^H? 
zY9Wo#?NNJe-9V0?$u#Xf36hoZxqTf*{uM1`f=q-A25q#j1cwoE@OF@WL-4Gi45Nry zKNF{&iv6pc_k_iCk_ zgDMQ+@CSudtWHKqE{BMXMx2vDH=47=V{sPWlHlfsm?`hc+@(Qeu9dgb8T_Hi3I=_r zR*h?dqU^4wf-WVpSu6(0WS1tur`Y|^ss0>(E;u7tZY`U9KEcP+tc-X%0deqQ0mpW= z$g@T|*tt7}K+_0IJnJDh6ka9zBb))%^<%vFYoP38)xJA zf1a$9L%8=(c7jBbQqLh*S|Jif1OP%o*!hQw{gtEtRx~Kc0u#dV|L&twO;Ns|1ET}^ zDU8E2(*qZC&Yhj=Kyx1*I8@KLKu5{tcdgdx=ogRcmuTxS*q1%b zrIA>$@38HU1>u}V&qrZ^riWEzfzoi;SO44rPm^LY}PrrLb?PJ4F~+n%ti;!EGx)y-o9Y z&0YL{n6Yc&jAi8F*m85|%ghR7r2pwM7N#^UIgs^`Ed>BT0IBh>3Ej=w!ouB+{rBR< zZ!>$Ao}Jq|56*|uogwOh>^1jkhnPsf)S+X?J+TH(faX%G1`SrFE><@K|O*X}`2-eut)%?EIx*HFE zUp;s5^!B;+qh*PV<+EVOX@OtJikkq4Q{&{B=Ju3Piy|oGnM=nN>2+8IQTtX!{Qb50 zyTqGi-*@qg%Xjz_n|F8XOr2+J33HkR--?H}%o83qK7>_dedRbzR$sdWcbpJG#~U*R&1ID&m>}L zs&xr6_(n4xNLSWL^Pbleb1y$CAU`SuUnt8>f0Nd9)4EkouIt^uaF}gRB1g`0YDAS) zyr}6Vtn^cG$&Qz!EAtUkD6ri65KQsx8JWc7iaYP%cO13LIk)wpliG z(;nMl@|^9?j~%^09aLWODqY+d9>foeXNhO@%@x7l6&-{C6cRVKu|&LJFE6a{H{3Pbwt9Qbs*EE5HX1xo#cszSDP#&?12;DFL{e4chaRk-hxXPtSU&?a1cJIrpk<)Ai&C}Z81*$XcH~Oh-!Nsn zh%#kM02l-4qH`Ps?pIMQDtg)8ns6<%g^cp&vZ#dQ?KU?~ip`aOV1Dx@t10SkAG8(a zRk5omGKatdA2gwfJKKsx17wXT74#r~+jz(nK~-~0A!aRWZq+1mvH+}HDxzRGXT+FN z@f94gaBqk;o=gd(Q1^d{QAnlt_SU0wWT%|KxLsaMWk9s#Jtv=xl)Jabk^DC$+gs^w zztljA*)?sDOph}BnLP#MT93!rKFoehz$F{37^(8kpfddSoTwoREMj4i$RW!dLJkR( zvLP_=pf?*NgeW;SQrF}C0@BhQ%a8OX^hH7FqYez%2ITRsW7M%aQS%Z*6f`x=4rGtc zW0;NDfKztZyvt1&dLoPGBh<|vQ9~ulSZSX^*ZSpg1jNiK)}jg?7@jymJg3d!ybk>E zgWe#J5D`#qV<+&CCqLlhp7Qo7fxYB)AO+kjpd^wXH>BZKp^*}}n)d_7_wRswV#7W- z;k9{X;cSDB63E>Wk|;d&(qgMcdzS&UR1@|`yPze*!5z8y0b_QFKw|-#_pOa(mNm=+ z#@ga6%V3Ay_$5cIfF&B9(4$30p_+5w*VV0rQ21*b@Q{KSHGlMQI05aZZpsQ0}EL{o!e z*;x!11dtB1KoiOQa=BbU@saU4ehFX}C1{KSGR6m4l(1uQqE;BeDg3o=p%_{4*gGhJ zRbWYN8@ftqMWMZD<@4Bga@xv3 zYJcwJ5uIq^Rs3wgl@0yEZvi;qs_2gFe$?U1V%MopvBgjoqsn5}s-Np7MLLpygeT(- zQ^1^{xx_l&GCTU2W#18$oy~lpgXhsCI(*(<4X&--a7$J^qK`#K&Bl4r z+4%h?nX0Rj*}1_aHMSu!+36xOPJ}I6#gA3s6k(GsMkpJ`cMTnUm>R+U2-Wf_KQ)!} z!OKgA_hzAu`2X%XHn(6Z`e#q?|~C2!0ZCww3pNKK#h`q>L@9>&hdM9?tVJik@ikH4Y@ZV#i&!|OgJn0M;gsk 
zioYEn1GlfTOj0za1)}3EGRuSf!bd+>Oot)hmme^$VOFqyF zUGx{3Uh0o{Ur0!UmecTtbA)}KPIk}3x`h2Nc5eHOYFiqZjQD&~y=Hby7i9uM zyFZ0`mEF%F-k%_fx#LlKGy;Y;)Th|%eOq(byFj8d2-hyuWjEJKe%eCwA^p?q&v1)o z(H;bz0;}a2BPS=$ISNaO=PeO!5##Z8M{IJHfa`6pTo5ygBxlj>cc zC89|y+9blaW`#M|6>1_bjxat2g-_8+xEj-{vDHhMhL7}GPz2APjV?QU>MOOI5%x8w zHi7hm!!NvSYjDJxP4z&HU-HFE(#|OJO-@(;9%;I2b{ffdk&NeeW-P(-KLx@riNv~) zP&Ux+$=>wyB*?q&OtwT3aK#II{D@Hza+_C30QdXRp;1c3dj`qk`^L@*N(xJmsMNk# zY(i}8eYB)%hTF8wNTLtFvlt`&+#a3YL5}YpSQBM2FVy&>VKjnEP~f~QdA^N6-937c zz^6&XO=sMMYh=4+LYvAsne;%ek{LXvd6{JlWsy*B_npx-707IHeVXJ@W98%$=!M4+ zfw@w6OC-4@_l9pPOXFSk2QW# zPTY(c_!d!1|KK@H<_YBn4;QO{dY>ea$zZW9393W9CTFpGXPvXy?VM2Q_T5|4&2Z-C z`M%t8%24E!#h6|Cx!&OA)uotidb=-ZnC|#&Cr7#c^A;Cn($NUqK%2Pdd)W4gXatVZ z1=Uvi7F(z>s+)|LX)KFL)T|pWbg$?+=VQlF^du~ny0an`E;YQk=yJ^p!`?FEejKm5 zS338KjhRxiYN>3^C_{!zxb=K@E8fu)K3=Yx-uK(l@YGF(XPBEJFj251R((sL zYJ?_^V`vp@IG3(7;jS~GvX`2t_u6F(iNgUG*K6x)RjOkvOl)2()RH7=+YGm)KD|A< z8fh&3^}9r_vUDgFQ+6(jSaIS+QiZJ4(e8CI9FrV@_-ibJqbc{ggRSXd z)11mJrY*#-TzfI-n08WB^vxhTI`#2>LC4z$TVN9tHMiX@_6|YctQUL>^|U^IzIw9< zp=^%hji*TpA%1O`WRieEqS7$@i}*C3q<&T-R{vyF?g^572WIU63=zGJ3{htCOcxAy zL9iER54BC`5`k)t5;?P&u=Q+vR-8#p*D@73J}R-gR@6ZEUfQhG0fTDHwglZ&uBiqTP7^O&-JtFP*fJWGL#OFp9Zd-U)(-| z=C2ye?^i%d$fdTK*uR)@a;b=oUOYscKr6&>2)xqcW&UOtCA`+$k${SMQj`#BhN`aV z<4^!()5q>l$G>}8+(<*3w$eFW=a=QvB1ezWmsvJe^h{EHU*8hf7d5k>Cllg{lIzif zg0F=CRddJPmc_JcWqPe%r8YUaGq63Ul1VSlX^pL;C@jj24*XE7AhvnCrfqjTjm$MF zBt=%5hhF4D8{(KPP8O^*juMc;s`DP_j6FV6x%*@_dGL^a8;Yw27A(IZ&RIr+I^8p- z8`Yh4K}KllUQ+5(!BKMXBU8ex#va`-t9O~Wz{>f;n(R}--kZQP5;W1(d%AUc9DPfX z>3f@j{LE)N3lCoXA4?%@)c|83p2`)^cDs;RF#_RN zxyjKf@KV`xbzIIFqfOq84K{S(ujyWBk;t;=LT$HDvQ7II9++itC<~p5<-t#U<-@>I z-l7-O$)SA})*ZahdORm>iFy*~vKtBOdPrZRDF(w$05-_IKeBi8{I#$Lie0q%*Sp=S zS5goI9qkLb(M%A+Og?`4DEcHX2Gx%)M(J8ChUQ4*(Jp*Ks@mc((U{RteHi)eoHzP= zAoi3xAnxsLZXsPrV`?bw2`#LOhotkP9fs%}CX{pr+(U>e`||He6?>(LG2ob0efJ%}Cmj3~tR*MBk}ED8o3`@FwZJgJOp~ z1m3rPG70A30947q?yw;r2e{i}NpzTbsCAn5(A$M*CiKJpeBB|z1W-6{u&Ti~xPOcXJ 
z%FzEWfC_nZ(FsQKeH=IgD{x&BLq3_snG)g)pbu+7Fg+3L`Z7n2#^+%)t%nPljddPh z5h&SPwik`YItsJMH4=S6=aV}~@hsAa){eo`3rHQ6sk^LlzQ%ergVv?vm_iw|v)Z&x zNyG@sV>-@afyz{}KEn6*Rw4`i$|(F^YAA+#b0Zd`_r^k!B29 z!%Le7g)7PQAQV2jD)p@^Ro`iS6*OlD&Q`z460^OF(QmeIZKwAz=Nrk*``N3AVXw0Gu>zrKEh4 zmRi0;9#^w5f6Zk7BH=pEv4uU6)f&%yDjM`qMQwYEGX?PFbm}e+Lvg>RGC68h0qM<& zi>CrgXoyuaz+!}ag-KkRl=k^j4azwgBOH}JPI z6~c+X?92ES_-l6hPiQ+N#qmq7`d9GZlV^WI0RTOu-@*T%RNAk0e$A--X(zrz1M1^yLIPW2b~za~T#c{s@E0su&mPXI)(muY_c^?xtK!fpTn literal 0 HcmV?d00001 diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 3f05e70c..e52328ad 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -52,8 +52,8 @@ def __init__( ) self.linear_a = nn.Linear(n_actions + 1, num_channels[-1]) self.linear1 = nn.Linear(hparams.mini_series_length, 1) - # self.linear2 = nn.Linear(num_channels[-1] * 2, output_size) - self.linear2 = nn.Linear(num_channels[-1], output_size) + self.linear2 = nn.Linear(num_channels[-1] * 2, output_size) + # self.linear2 = nn.Linear(num_channels[-1], output_size) def forward(self, state, past_pw): s1 = self.tcn(*state) @@ -63,8 +63,8 @@ def forward(self, state, past_pw): else: s2 = torch.tanh(self.linear1(s1).squeeze(-1)) # bring together the state and past_pw representations - # return self.linear2(torch.cat([s2, a1], dim=1)) - return self.linear2(s2) + return self.linear2(torch.cat([s2, a1], dim=1)) + # return self.linear2(s2) class ActorContinous(nn.Module): @@ -109,7 +109,7 @@ def forward(self, states, past_pw): concentrations = self.inv_lin(logits).squeeze(0) pi = Dirichlet(concentrations) actions = pi.sample() - return pi, actions + return pi, actions, logits def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): """ @@ -149,17 +149,19 @@ def __call__( past_pw: the previous portfolio value and weights device: the device used for the current batch Returns: - torch dsitribution and randomly sampled action + torch distribution and 
randomly sampled action, the logits that went into + the Dirichlet dist, the probability of the sample, the estimated reward + for this action by the critic """ state = [s.to(device=device) for s in state] past_pw = past_pw.to(device=device) - pi, actions = self.actor_net(state, past_pw) + pi, actions, actor_logits = self.actor_net(state, past_pw) log_p = self.get_log_prob(pi, actions) value = self.critic_net(state, past_pw) - return pi, actions, log_p, value + return pi, actions, actor_logits, log_p, value def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor) -> torch.Tensor: """ @@ -184,6 +186,6 @@ def __init__(self): super().__init__() def forward(self, x): - x[x < 0] = 1 / (1 - x[x < 0]) x[x >= 0] = x[x >= 0] + 1 + x[x < 0] = 1 / (1 - x[x < 0]) return x diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index c64780ee..8bf2f078 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -201,7 +201,9 @@ def train_batch( """ past_pw = self._init_past_pw() for step in range(self.hparams.steps_per_epoch): - pi, action, log_prob, value = self.agent(self.state, past_pw, self.device) + pi, action, actor_logits, log_prob, value = self.agent( + self.state, past_pw, self.device + ) next_state, reward, done, info = self.env.step(action.cpu().numpy()) # drop first batch dim so dataloader later can resample them for backprop @@ -213,7 +215,7 @@ def train_batch( self.ep_values.append(value.item()) self.state = next_state - past_pw = self._update_past_pw(info["portfolio_value"], action) + past_pw = self._update_past_pw(info["portfolio_value"], actor_logits) epoch_end = step == (self.hparams.steps_per_epoch - 1) terminal = len(self.ep_rewards) == self.hparams.max_episode_length if epoch_end or done or terminal: @@ -238,7 +240,7 @@ def train_batch( self.batch_actions, self.batch_logp, self.batch_qvals, - self.batch_adv, + PPO.normalise_advantage(self.batch_adv), ) for state, past_pw, action, logp_old, qval, adv 
in train_data: @@ -259,7 +261,7 @@ def train_batch( self.done_episodes = 0 def actor_loss(self, state, past_pw, action, logp_old, adv) -> torch.Tensor: - pi, _ = self.actor(state, past_pw) + pi, _, _ = self.actor(state, past_pw) logp = self.actor.get_log_prob(pi, action) old_new_diff = logp - logp_old ratio = torch.exp(old_new_diff) @@ -362,8 +364,8 @@ def _init_past_pw(self) -> torch.Tensor: portfolio is reset for each trajector p0=1, w0[0]=1 (USD relative price is always 1). """ - past_pw = torch.zeros(len(self.hparams.asset_names) + 2).to(self.device) - past_pw[:2] = 1 + past_pw = torch.ones(len(self.hparams.asset_names) + 2).to(self.device) + # past_pw[:2] = 1 return past_pw.unsqueeze(0) def _update_past_pw(self, p1: float, action: torch.Tensor) -> torch.Tensor: @@ -373,7 +375,7 @@ def _update_past_pw(self, p1: float, action: torch.Tensor) -> torch.Tensor: new state to form their outputs. """ p1 = torch.Tensor([p1]).to(self.device) - return torch.cat([p1, action]).unsqueeze(0) + return torch.cat([p1.unsqueeze(0), action], -1) @staticmethod def _pre_sanity_check(hparams: Namespace): From e8d9e8f97c53c76b5f53c293f2a7f84971a9eba0 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Mon, 11 Jan 2021 11:30:21 +0000 Subject: [PATCH 22/62] major rewrite of the experience collection to make it parallel - doesn't work yet --- src/dagobert/modelling/rl/__init__.py | 1 + src/dagobert/modelling/rl/networks.py | 4 +- src/dagobert/modelling/rl/ppo.py | 259 +++++++++++++------------- src/dagobert/modelling/rl/utils.py | 203 +++++++++++++++++++- 4 files changed, 330 insertions(+), 137 deletions(-) diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py index d4900664..72758a44 100644 --- a/src/dagobert/modelling/rl/__init__.py +++ b/src/dagobert/modelling/rl/__init__.py @@ -1,3 +1,4 @@ from .environment import RLData, RLPortfolio, RLEnv from .networks import ActorCriticTCN, ActorCriticAgent, ActorContinous from .ppo import PPO +from .utils 
import ExperienceBuffer, ParallelExperiences diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index e52328ad..4082a7cf 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -62,8 +62,8 @@ def forward(self, state, past_pw): s2 = s1[:, :, -1] else: s2 = torch.tanh(self.linear1(s1).squeeze(-1)) - # bring together the state and past_pw representations - return self.linear2(torch.cat([s2, a1], dim=1)) + # bring together the state and past_pw representations make residual connection + return past_pw[:, 1:] + self.linear2(torch.cat([s2, a1], dim=1)) # return self.linear2(s2) diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 8bf2f078..e90241e9 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -9,6 +9,7 @@ from typing import List, Tuple from argparse import Namespace + import gym import torch import numpy as np @@ -27,6 +28,8 @@ ActorCriticTCN, ActorContinous, ActorCriticAgent, + ParallelExperiences, + ExperienceBuffer, ) from dagobert.modelling.dl import ( ExperienceSourceDataset, @@ -70,6 +73,7 @@ def run_rl(args): # define trainer and and lightning module args.multiprocessing = True if args.gpus != 1 else False + args.num_workers = 1 if args.num_workers == 0 else args.num_workers trainer = Trainer.from_argparse_args( args, logger=tcn_loggers, @@ -106,10 +110,9 @@ def __init__(self, hparams: Namespace): hparams = Preprocessing().preprocess_train_dfs(hparams) self.hparams = TCNLightning._check_mini_series_lookback(hparams) - # create env, init starting state and policy/value networks + # create env, policy/value networks and experience buffer + tracking vars self.env = RLEnv(self.hparams) - self.state = self.env.reset() - n_actions = self.env.action_space.shape[0] + n_actions = self.envs.action_space.shape[0] self.critic = ActorCriticTCN( self.hparams, n_actions=n_actions, output_size=1, actor=False ) @@ -117,27 
+120,73 @@ def __init__(self, hparams: Namespace): ActorCriticTCN(self.hparams, n_actions=n_actions, output_size=n_actions) ) self.agent = ActorCriticAgent(self.actor, self.critic) - - # init batching and progress tracking vars - self.batch_states = [] - self.batch_past_pw = [] - self.batch_actions = [] - self.batch_adv = [] - self.batch_qvals = [] - self.batch_logp = [] - - self.ep_rewards = [] - self.ep_values = [] - - self.done_episodes = 0 - self.epoch_rewards = 0 + self.buffer = ExperienceBuffer() self.avg_ep_reward = 0 self.avg_ep_len = 0 self.avg_reward = 0 - def discount_rewards(self, rewards: List[float], discount: float) -> List[float]: + # ---------------------------------------------------------------------------------- + # EXPERIENCE COLLECTION FOR TRAIN DATALOADER + # ---------------------------------------------------------------------------------- + + def train_dataloader(self) -> DataLoader: + """Initialize the Experience Buffer dataset used for retrieving experiences""" + dataset = ExperienceSourceDataset(self.generate_experience_buffer) + dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size) + return dataloader + + def generate_experience_buffer( + self, + ) -> Tuple[ + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + ]: + """ + Logic for generating trajectory data to train policy and value networks. This + is done leveraging the `Process` and `Queue` classes of the `multiprocessing` + module of Python. We'll launch hparams.num_workers number of new processes, + each replicating the environment in memory, so this can get expensive where + `num_assets` in the portfolio is large. + + Yield: + Tuple of Lists containing tensors for states, actions, log probs, qvals and + advantage. 
+ """ + # setup workers and pass them the env, agent, vars to work with + max_worker_steps = int(self.hparams.steps_per_epoch / self.hparams.num_workers) + parallel_experiences = ParallelExperiences() + for i in range(self.hparams.num_workers): + args = ( + deepcopy(self.env), + self.agent, + self.device, + max_worker_steps, + self.hparams.max_episode_length, + len(self.hparams.asset_names), + self.hparams.gamma, + self.hparams.lam, + ) + parallel_experiences.create_worker(args) + + # collect experiences in parallel, then merge them and create dataset + self.buffer.merge_buffers(parallel_experiences.collect_experiences()) + self.buffer.yield_dataset() # this will yield a dataset for dataloader + self.buffer.clear_buffer() + + # finally update metrics we log + self.avg_ep_reward = self.buffer.epoch_rewards / self.buffer.done_episodes + self.avg_reward = self.buffer.epoch_rewards / self.hparams.steps_per_epoch + self.avg_ep_len = self.hparams.steps_per_epoch / self.buffer.done_episodes + + @staticmethod + def discount_rewards(rewards: List[float], discount: float) -> List[float]: """ - Calculate the discounted rewards of all rewards in list. + Calculate the discounted rewards of all rewards in list. This is used as + Q-values for training the critic network so it becomes better approximating + the real reward we can expect from a given state. Args: rewards: list of rewards/advantages @@ -153,24 +202,39 @@ def discount_rewards(self, rewards: List[float], discount: float) -> List[float] cumul_reward.append(sum_r) return list(reversed(cumul_reward)) - def calc_advantage(self, rewards: List[float], values: List[float]) -> List[float]: + @staticmethod + def calc_advantage( + rewards: List[float], + values: List[float], + gamma: float = 0.99, + lam: float = 0.95, + norm: bool = True, + ) -> List[float]: """ Calculate the advantage given rewards, state values, and last value of episode. 
+ The advantage compares how much better the actor did compared to what the + critic thought the given state is worth in reward. Args: rewards: list of episode rewards values: list of state values from critic + gamma: Gamma for discounting the long-term rewards. + lam: Lambda for the GAE advantage calculation. + norm: If True, the advantages are normalised to mean=0, std=1. Returns: - list of advantages + List of advantages. """ # GAE delta = [ - rewards[i] + self.hparams.gamma * values[i + 1] - values[i] + rewards[i] + gamma * values[i + 1] - values[i] for i in range(len(rewards) - 1) ] - adv = self.discount_rewards(delta, self.hparams.gamma * self.hparams.lam) - return self.normalise_advantage(adv) + adv = PPO.discount_rewards(delta, gamma * lam) + if norm: + return PPO.normalise_advantage(adv) + else: + return adv @staticmethod def normalise_advantage(batch_adv: List[float]) -> List[float]: @@ -183,82 +247,30 @@ def normalise_advantage(batch_adv: List[float]) -> List[float]: adv = (adv - adv.mean()) / (adv.std() + np.finfo(float).eps) return list(adv) - def train_batch( - self, - ) -> Tuple[ - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - ]: + @staticmethod + def _init_past_pw(asset_num, device) -> torch.Tensor: """ - Logic for generating trajectory data to train policy and value network + Init past portfolio value and weights to [1, 1, 0, ..., 0], since after the + portfolio is reset for each trajector p0=1, w0[0]=1 (USD relative price is + always 1). + """ + past_pw = torch.ones(asset_num + 2).to(device) + # past_pw[:2] = 1 + return past_pw.unsqueeze(0) - Yield: - Tuple of Lists containing tensors for states, actions, log probs, qvals and - advantage. 
+ @staticmethod + def _update_past_pw(p1: float, action: torch.Tensor, device) -> torch.Tensor: """ - past_pw = self._init_past_pw() - for step in range(self.hparams.steps_per_epoch): - pi, action, actor_logits, log_prob, value = self.agent( - self.state, past_pw, self.device - ) - next_state, reward, done, info = self.env.step(action.cpu().numpy()) - - # drop first batch dim so dataloader later can resample them for backprop - self.batch_states.append([s.squeeze(0) for s in self.state]) - self.batch_past_pw.append(past_pw.squeeze(0)) - self.batch_actions.append(action) - self.batch_logp.append(log_prob) - self.ep_rewards.append(reward) - self.ep_values.append(value.item()) - self.state = next_state - - past_pw = self._update_past_pw(info["portfolio_value"], actor_logits) - epoch_end = step == (self.hparams.steps_per_epoch - 1) - terminal = len(self.ep_rewards) == self.hparams.max_episode_length - if epoch_end or done or terminal: - # discounted cumulative reward - self.batch_qvals += self.discount_rewards( - self.ep_rewards, self.hparams.gamma - )[:-1] - # advantage - self.batch_adv += self.calc_advantage(self.ep_rewards, self.ep_values) - # logs - self.done_episodes += 1 - self.epoch_rewards += np.sum(self.ep_rewards) - # reset params - self.ep_rewards = [] - self.ep_values = [] - self.state = self.env.reset() - - if epoch_end: - train_data = zip( - self.batch_states, - self.batch_past_pw, - self.batch_actions, - self.batch_logp, - self.batch_qvals, - PPO.normalise_advantage(self.batch_adv), - ) - - for state, past_pw, action, logp_old, qval, adv in train_data: - yield state, past_pw, action, logp_old, qval, adv - - self.batch_states.clear() - self.batch_past_pw.clear() - self.batch_actions.clear() - self.batch_adv.clear() - self.batch_logp.clear() - self.batch_qvals.clear() - - self.avg_ep_reward = self.epoch_rewards / self.done_episodes - self.avg_reward = self.epoch_rewards / self.hparams.steps_per_epoch - self.avg_ep_len = self.hparams.steps_per_epoch / 
self.done_episodes - - self.epoch_rewards = 0 - self.done_episodes = 0 + After each interaction, update the past weight / portfolio value vector as for + the next interaction the actor and critic networks take that in along with the + new state to form their outputs. + """ + p1 = torch.Tensor([p1]).to(device) + return torch.cat([p1.unsqueeze(0), action], -1) + + # ---------------------------------------------------------------------------------- + # LOSSES AND OPTIMIZERS + # ---------------------------------------------------------------------------------- def actor_loss(self, state, past_pw, action, logp_old, adv) -> torch.Tensor: pi, _, _ = self.actor(state, past_pw) @@ -278,6 +290,22 @@ def critic_loss(self, state, past_pw, qval) -> torch.Tensor: loss_critic = (qval - value).pow(2).mean() return loss_critic + def configure_optimizers(self) -> List[optim.Optimizer]: + """ Initialize Adam optimizer""" + optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.hparams.lr_actor) + optimizer_critic = optim.Adam( + self.critic.parameters(), lr=self.hparams.lr_critic + ) + return optimizer_actor, optimizer_critic + + def optimizer_step(self, *args, **kwargs): + for _ in range(self.hparams.n_optim_iters): + super().optimizer_step(*args, **kwargs) + + # ---------------------------------------------------------------------------------- + # TRAINING PHASE + # ---------------------------------------------------------------------------------- + def training_step( self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx, optimizer_idx ): @@ -340,43 +368,6 @@ def training_step( ) return loss_critic - def configure_optimizers(self) -> List[optim.Optimizer]: - """ Initialize Adam optimizer""" - optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.hparams.lr_actor) - optimizer_critic = optim.Adam( - self.critic.parameters(), lr=self.hparams.lr_critic - ) - return optimizer_actor, optimizer_critic - - def optimizer_step(self, *args, **kwargs): - for _ in 
range(self.hparams.n_optim_iters): - super().optimizer_step(*args, **kwargs) - - def train_dataloader(self) -> DataLoader: - """Initialize the Replay Buffer dataset used for retrieving experiences""" - dataset = ExperienceSourceDataset(self.train_batch) - dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size) - return dataloader - - def _init_past_pw(self) -> torch.Tensor: - """ - Init past portfolio value and weights to [1, 1, 0, ..., 0], since after the - portfolio is reset for each trajector p0=1, w0[0]=1 (USD relative price is - always 1). - """ - past_pw = torch.ones(len(self.hparams.asset_names) + 2).to(self.device) - # past_pw[:2] = 1 - return past_pw.unsqueeze(0) - - def _update_past_pw(self, p1: float, action: torch.Tensor) -> torch.Tensor: - """ - After each interaction, update the past weight / portfolio value vector as for - the next interaction the actor and critic networks take that in along with the - new state to form their outputs. - """ - p1 = torch.Tensor([p1]).to(self.device) - return torch.cat([p1.unsqueeze(0), action], -1) - @staticmethod def _pre_sanity_check(hparams: Namespace): # ensure we have the rl specific target column in the config diff --git a/src/dagobert/modelling/rl/utils.py b/src/dagobert/modelling/rl/utils.py index f2651087..25520783 100644 --- a/src/dagobert/modelling/rl/utils.py +++ b/src/dagobert/modelling/rl/utils.py @@ -1,10 +1,211 @@ -"""Util functions for portfolio optimization and other RL related tasks""" +""" +Util functions for portfolio optimization and other RL related tasks, including the +classes for gathering experience in parallel. +""" +# pylint: disable=no-member +from typing import List +from multiprocessing import Process, Queue +import gym +import torch import numpy as np +from dagobert.modelling.rl import PPO, ActorCriticAgent + eps = np.finfo(float).eps +class ExperienceBuffer: + """ + Object holding all states, rewards, actions, logp vals, etc of a rollout session, + i.e. 
the phase of the training when we're collecting experience to train on later + using the current policy of the actor. + + This is designed to work both with a single worker (single process) or with + multiple workers collecting experience in parallel. + """ + + def __init__(self): + """Class constructor""" + # step vars + self.states = [] + self.past_pws = [] + self.actions = [] + self.advs = [] + self.qvals = [] + self.logps = [] + self.infos = [] + + # episode vars + self.ep_rewards = [] + self.ep_values = [] + self.done_episodes = 0 + self.epoch_rewards = 0 + + def append( + self, + state: torch.Tensor, + past_pw: torch.Tensor, + action: torch.Tensor, + logp: torch.Tensor, + reward: float, + value: torch.Tensor, + info: dict, + ): + """ + Appends the state (including portfolio value and weights), actions, logp, + reward to the buffer after a single step taken in the environment. + + Args: + state: State that went into the agent (i.e. both actor and critic). + past_pw: Past portfolio value and weights that went into the agent. + action: Agent's action to the state and past_pw. + logp: Log-probability of the action sampled from the actor's distribution. + reward: Reward obtained by the action. + value: Estimated (by critic) reward we should have got with this action. + info: Portfolio related information returned by the env after the step. + """ + # drop first batch dim so dataloader later can resample them for backprop + self.states.append([s.squeeze(0) for s in state]) + self.past_pws.append(past_pw.squeeze(0)) + self.actions.append(action) + self.logps.append(logp) + self.infos.append(info) + self.ep_rewards.append(reward) + self.ep_values.append(value.item()) + + def merge_buffers(self, buffers: List[ExperienceBuffer]): + """ + Merges the passed in ExperienceBuffers and overwrites the current state with it. + + Args: + buffers: List of smaller ExpereinceBuffers to merge together from parallel + processes. 
+ """ + pass + + def yield_dataset( + self, + ) -> Tuple[ + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + ]: + """ + Yields an iterable dataset for Pytorch Lightning from the contents of the + ExperienceBuffer. + + Yield: + Tuple of Lists containing tensors for states, actions, log probs, qvals and + advantage. + """ + data = zip( + self.batch_states, + self.batch_past_pw, + self.batch_actions, + self.batch_logp, + self.batch_qvals, + PPO.normalise_advantage(self.batch_adv), + ) + for state, past_pw, action, logp_old, qval, adv in data: + yield state, past_pw, action, logp_old, qval, adv + + def clear_buffer(self): + """Resets the ExperienceBuffer.""" + self.batch_states.clear() + self.batch_past_pw.clear() + self.batch_actions.clear() + self.batch_adv.clear() + self.batch_logp.clear() + self.batch_qvals.clear() + + +class ParallelExperiences: + def __init__(self): + self.exp_queue = Queue() + self.processes = [] + + def collect_experiences(self) -> List[ExperienceBuffer]: + """Returns the experiences from parallel workers. You need to wait for these.""" + exp_buffers = [] + # gather results from workers using the queue and merge them into one + for process in self.processes: + exp_buffers.append(self.exp_queue.get()) # will block + for process in self.processes: + process.join() + return exp_buffers + + def create_worker(self, *args): + """Creates a new worker, with the args passed in for `_gather_experience`.""" + process = Process(target=self.gather_experience, args=args) + self.processes.append(process) + process.start() + + def gather_experience( + self, + env: gym.Env, + agent: ActorCriticAgent, + device: torch.device, + max_steps: int, + max_episode_length: int, + asset_num: int, + gamma: float, + lam: float, + ): + """ + Workhorse function of the parallel experience gathering. 
This function can be + called as many times as many CPUs are available on the system, to collect the + desired number of steps and store them into an `ExperienceBuffer` that is then + passed back (via a `multiprocessing.Queue` object) to the main process that + spawned the parallel processes. + + Args: + env: An instance of the environment to act on. + agent: An instance of the PPO's `ActorCriticAgent`. + device: Device where the agent lives (GPU or CPU). + max_steps: Total number of steps (over multiple episodes) a worker can take. + max_episode_length: Maximum length of a trajectory / episode. + asset_num: Number of assets we are modelling (not including USD). + gamma: See docs of :func:`PPO.calc_advantage` + lam: See docs of :func:`PPO.calc_advantage` + + Returns: + Adds the results to `exp_queue` so it can be processed in the main process. + """ + buffer = ExperienceBuffer() + state = env.reset() + past_pw = PPO._init_past_pw(asset_num, device) + for step in range(max_steps): + # get action, make step, get reward and info from env + pi, action, actor_logits, logp, value = agent(state, past_pw, device) + next_state, reward, done, info = env.step(action.cpu().numpy()) + + # store everything and update state, past_pw + buffer.append(state, past_pw, action, logp, reward, value, info) + state = next_state + past_pw = PPO._update_past_pw(info["portfolio_value"], actor_logits, device) + + terminal = len(buffer.ep_rewards) == max_episode_length + if done or terminal: + buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma)[:-1] + buffer.advs += PPO.calc_advantage( + buffer.ep_rewards, buffer.ep_values, gamma, lam + ) + buffer.done_episodes += 1 + buffer.epoch_rewards += np.sum(buffer.ep_rewards) + + # episode over, reset the env and the buffer + buffer.ep_rewards = [] + buffer.ep_values = [] + state = env.reset() + past_pw = PPO._init_past_pw(asset_num, device) + + # add collected experience to the queue so it can be returned to master process + 
self.exp_queue.put(buffer) + + def sharpe_ratio(returns, freq: int = 30, rfr: int = 0): """ Given a set of returns, calculates naive (rfr=0) sharpe (eq 28). From d90a86b2683320fe38868dd7c8fe809a52bba74e Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Tue, 12 Jan 2021 14:37:20 +0000 Subject: [PATCH 23/62] optimizers --- .../modelling/augmentation/timegan.py | 107 +++++++++++++++++- 1 file changed, 101 insertions(+), 6 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 94a2b076..76ad3c94 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -20,11 +20,11 @@ from pytorch_lightning import LightningModule +from dagobert.modelling.dl import AdaBelief + class RnnBlock(nn.Module): - """ - Generate time-series data in latent space. - """ + """""" def __init__( self, @@ -67,9 +67,7 @@ def forward(self, z): class Supervisor(nn.Module): - """ - Generate next sequence using the previous sequence. - """ + """""" def __init__( self, @@ -248,6 +246,7 @@ def __init__(self, hparams: Namespace): all_inputs = sum(num_inputs) # components of network + # Generate time-series data in latent space. self.generator = RnnBlock( input_size=self.hparams.z_dim, hidden_size=self.hparams.hidden_size, @@ -264,6 +263,7 @@ def __init__(self, hparams: Namespace): batch_first=True, rnn=self.hparams.rnn, ) + # Generate next sequence using the previous sequence. self.supervisor = RnnBlock( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, @@ -300,3 +300,98 @@ def __init__(self, hparams: Namespace): ) self.recovery_linear = nn.Linear(self.hparams.hidden_size, all_inputs) self.discriminator_linear = nn.Linear(self.hparams.hidden_size, 1) + + def forward(self, *x): + pass + + def configure_optimizers(self): + """ + Optimizer setup. list of optimizers accessed by idx in training step. 
+ """ + optimizers = [] + param_pairs = [ + list(self.embedder.parameters()) + list(self.recovery.parameters()), + list(self.generator.parameters()) + list(self.supervisor.parameters()), + list(self.generator.parameters()) + list(self.supervisor.parameters()), + list(self.embedder.parameters()) + list(self.recovery.parameters()), + list(self.discriminator.parameters()), + ] + if "adam" in self.hparams.optimizer.lower(): + for param_pair in param_pairs: + optimizer = torch.optim.AdamW(param_pair, lr=self.hparams.lr) + optimizers.append(optimizer) + elif "adabelief" in self.hparams.optimizer.lower(): + for param_pair in param_pairs: + optimizer = AdaBelief(param_pair, lr=self.hparams.lr) + optimizers.append(optimizer) + return optimizers + + # ---------------------------------------------------------------------------------- + # SETUP FUNCTIONS + # ---------------------------------------------------------------------------------- + + def _setup_loss(self, loss_name): + """ + Different losses are used to direct updates in the components of TimeGAN. + """ + if loss_name == "BCEWithLogits": + return nn.BCEWithLogitsLoss() + elif self.hparams.regression: + return nn.MSELoss() + + else: + if self.hparams.no_class_weights: + if self.hparams.output_size > 1: + return nn.CrossEntropyLoss() + else: + return nn.BCEWithLogitsLoss() + else: + if self.hparams.output_size > 1: + return nn.CrossEntropyLoss(self._get_class_weights()) + else: + pos_weight = self._get_class_weights()[1] + return nn.BCEWithLogitsLoss(pos_weight=pos_weight) + + # ---------------------------------------------------------------------------------- + # CALCULATION + # ---------------------------------------------------------------------------------- + + def _calculate_loss(self, x, y_true): + """ + Calculates the appropriate loss, given the `classification` flag. + + Args: + x: A batch of X. + y_true: A batch of target. + + Returns: + Tuple of loss, y_true and y_pred. 
+ """ + y_pred = self(*[xi.float() for xi in x]) + if self.hparams.output_size == 1 and self.hparams.last_y: + y_true = y_true.reshape(-1, 1).float() + elif self.hparams.output_size == 1 and not self.hparams.last_y: + y_true = y_true.float() + elif self.hparams.output_size == 3 and not self.hparams.regression: + # convert triple barrier method's -1/0/1 into 0, 1, 2 torch's cross-entropy + y_true = y_true.long() + 1 + if not self.hparams.last_y: + # we only keep the latest fraction of labels of the mini-series + keep = int(self.hparams.mini_series_length * self.hparams.non_last_y_frac) + keep_ix = self.hparams.mini_series_length - keep + y_true = y_true[:, keep_ix:] + if self.hparams.output_size == 1: + y_pred = y_pred[:, keep_ix:] + else: + y_pred = y_pred.transpose(1, 2)[:, :, keep_ix:] + loss = self.loss_f(y_pred, y_true) + + if self.hparams.mix_density_net: + # for mix density nets we need to estimate y_preds as a mixture of mus + y_pred = self.loss_f.get_mu_preds(y_pred) + elif not self.hparams.last_y: + # we only use the last timepoint's pred for plotting and metric calculation + # otherwise we often run out of memory at the end of the epoch + y_pred = y_pred[:, -1] + y_true = y_true[:, -1] + return loss, y_true, y_pred From 7630ab2376c9498e4e7a2f9de49dc44517bd1338 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Tue, 12 Jan 2021 17:59:39 +0000 Subject: [PATCH 24/62] clean up some of mess, adding flow of training step --- .../modelling/augmentation/timegan.py | 214 ++++-------------- 1 file changed, 46 insertions(+), 168 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 76ad3c94..1790bf9b 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -15,6 +15,7 @@ import torch import torch.nn as nn +import torch.optim as optim import torch.nn.functional as f from torch.nn.utils import weight_norm @@ -24,16 +25,21 @@ class 
RnnBlock(nn.Module): - """""" + """ + Class for creating 5 components of TimeGAN. + """ def __init__( self, input_size: int, hidden_size: int, num_layers: int, + linear_input_size: int, + linear_output_size: int, dropout: float = 0.2, batch_first: bool = True, rnn: str = "lstm", + linear_activation: bool = True, ): super(RnnBlock, self).__init__() @@ -55,165 +61,27 @@ def __init__( batch_first=batch_first, ) self.tanh = nn.Tanh() - # TODO: whats' size of output in latent space - self.linear = nn.Linear(hidden_size, hidden_size) - self.sigmoid = nn.Sigmoid() - - def forward(self, z): - # lstm_out = (batch_size, seq_len, hidden_size) - rnn_out, _hidden = self.rnn(z) - output = self.tanh(rnn_out) - return output - - -class Supervisor(nn.Module): - """""" - - def __init__( - self, - input_size, - hidden_size, - num_layers, - dropout, - batch_first=True, - ): - super(Supervisor, self).__init__() - - # input/output: (batch, seq, feature) - # TODO: hparams? - self.lstm = nn.LSTM( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=dropout, - batch_first=batch_first, - ) - self.tanh = nn.Tanh() - # TODO: whats' size of output in latent space - self.linear = nn.Linear(hidden_size, hidden_size) - self.sigmoid = nn.Sigmoid() - - def forward(self, z): - # lstm_out = (batch_size, seq_len, hidden_size) - lstm_out, _hidden = self.lstm(z) - lstm_out = self.tanh(lstm_out) - synthetic_series = self.sigmoid(self.linear(lstm_out)) - return synthetic_series - - -class Discriminator(nn.Module): - """ - Discriminate the original and synthetic time-series data - """ - - def __init__( - self, - input_size, - hidden_size, - num_layers, - dropout, - batch_first=True, - ): - super(Discriminator, self).__init__() - - # input/output: (batch, seq, feature) - # TODO: hparams? 
- self.lstm = nn.LSTM( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=dropout, - batch_first=batch_first, - ) - self.tanh = nn.Tanh() - # TODO: whats' size of output in latent space - self.linear = nn.Linear(hidden_size, 1) - - def forward(self, z): - # lstm_out = (batch_size, seq_len, hidden_size) - lstm_out, _hidden = self.lstm(z) - lstm_out = self.tanh(lstm_out) - synthetic_series = self.linear(lstm_out) - return synthetic_series - - -class Embedder(nn.Module): - """ - Embedding network between original feature space to latent space. - """ - - def __init__( - self, - input_size, - hidden_size, - num_layers, - dropout, - batch_first=True, - ): - super(Embedder, self).__init__() - - # input/output: (batch, seq, feature) - # TODO: hparams? - self.lstm = nn.LSTM( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=dropout, - batch_first=batch_first, - ) - self.tanh = nn.Tanh() - # TODO: whats' size of output in latent space - self.linear = nn.Linear(hidden_size, hidden_size) + self.linear = nn.Linear(linear_input_size, linear_output_size) self.sigmoid = nn.Sigmoid() + self.linear_activation = linear_activation def forward(self, x): - # lstm_out = (batch_size, seq_len, hidden_size) - lstm_out, _hidden = self.lstm(x) - lstm_out = self.tanh(lstm_out) - embedded_real = self.sigmoid(self.linear(lstm_out)) - return embedded_real - - -class Recovery(nn.Module): - """ - Recovery network from latent space to original space. - """ - - def __init__( - self, - input_size, - hidden_size, - num_layers, - dropout, - batch_first=True, - ): - super(Recovery, self).__init__() - - # input/output: (batch, seq, feature) - # TODO: hparams? 
- self.lstm = nn.LSTM( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=dropout, - batch_first=batch_first, - ) - self.tanh = nn.Tanh() - # TODO: output size is same as original number of features - self.linear = nn.Linear(hidden_size, input_size) - self.sigmoid = nn.Sigmoid() - - def forward(self, x): - # lstm_out = (batch_size, seq_len, hidden_size) - lstm_out, _hidden = self.lstm(x) - lstm_out = self.tanh(lstm_out) - X_tilde = self.sigmoid(self.linear(lstm_out)) - return X_tilde + rnn_out, _hidden = self.rnn(x) + rnn_out = self.tanh(rnn_out) + output = self.linear(rnn_out) + if self.linear_activation: + output = self.sigmoid(output) + return output class TimeGANLightning(LightningModule): """ - Lightning model made of RNN nets working together. + Lightning model made of 5 RNN nets working together: + - Embedding network between original feature space to latent space. + - Recovery network from latent space to original space. + - Generator function: generate time-series data in latent space. + - Discriminate the original and synthetic time-series data + - Supervisor generating next sequence using the previous sequence. """ # ---------------------------------------------------------------------------------- @@ -246,65 +114,75 @@ def __init__(self, hparams: Namespace): all_inputs = sum(num_inputs) # components of network - # Generate time-series data in latent space. 
self.generator = RnnBlock( input_size=self.hparams.z_dim, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, batch_first=True, rnn=self.hparams.rnn, + linear_activation=True, ) self.embedder = RnnBlock( input_size=all_inputs, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, batch_first=True, rnn=self.hparams.rnn, + linear_activation=True, ) # Generate next sequence using the previous sequence. self.supervisor = RnnBlock( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=self.hparams.hidden_size, dropout=self.hparams.dropout, batch_first=True, rnn=self.hparams.rnn, + linear_activation=True, ) self.recovery = RnnBlock( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=all_inputs, dropout=self.hparams.dropout, batch_first=True, rnn=self.hparams.rnn, + linear_activation=True, ) self.discriminator = RnnBlock( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.num_layers, + linear_input_size=self.hparams.hidden_size, + linear_output_size=1, dropout=self.hparams.dropout, batch_first=True, rnn=self.hparams.rnn, + linear_activation=False, ) - # final linear layers - self.generator_linear = nn.Linear( - self.hparams.hidden_size, self.hparams.hidden_size - ) - self.embedder_linear = nn.Linear( - self.hparams.hidden_size, self.hparams.hidden_size - ) - self.supervisor_linear = nn.Linear( - self.hparams.hidden_size, self.hparams.hidden_size - ) - self.recovery_linear = 
nn.Linear(self.hparams.hidden_size, all_inputs) - self.discriminator_linear = nn.Linear(self.hparams.hidden_size, 1) - def forward(self, *x): + def training_step(self, batch, batch_idx, optimizer_idx): + real, _ = batch + # embedding + h = torch.tanh(self.embedder(real)) + h = torch.sigmoid(self.embedder_linear(h)) + # recover embedding + x_tilde = torch.tanh(self.recovery(h)) + x_tilde = torch.sigmoid(self.recovery_linear(x_tilde)) + pass - def configure_optimizers(self): + def configure_optimizers(self) -> List[optim.Optimizer]: """ Optimizer setup. list of optimizers accessed by idx in training step. """ From 1cb5a8e6219fd2b692fccabcbada232c9d01a767 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Wed, 13 Jan 2021 07:43:35 +0000 Subject: [PATCH 25/62] gave up on parallel ppo on windows, let's try it on aws/linux --- config/rl_config.yaml | 2 +- src/dagobert/modelling/rl/__init__.py | 1 - src/dagobert/modelling/rl/environment.py | 1 - src/dagobert/modelling/rl/ppo.py | 281 ++++++++++++++++++----- src/dagobert/modelling/rl/utils.py | 196 ---------------- 5 files changed, 229 insertions(+), 252 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index e6488e7a..5f273987 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -17,7 +17,7 @@ auto_scale_batch_size: # -------------------------------------------------------------------------------------- log_dir: logs -num_workers: 4 +num_workers: 1 exp_name: RL-PPO-TCN tags: - RL_test diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py index 72758a44..d4900664 100644 --- a/src/dagobert/modelling/rl/__init__.py +++ b/src/dagobert/modelling/rl/__init__.py @@ -1,4 +1,3 @@ from .environment import RLData, RLPortfolio, RLEnv from .networks import ActorCriticTCN, ActorCriticAgent, ActorContinous from .ppo import PPO -from .utils import ExperienceBuffer, ParallelExperiences diff --git a/src/dagobert/modelling/rl/environment.py 
b/src/dagobert/modelling/rl/environment.py index a7074f91..11efe152 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -13,7 +13,6 @@ from dagobert.naming import NPreprocessingArgs as npa from dagobert.modelling.dl import PortfolioCryptoDataset -from dagobert.modelling.rl.utils import sharpe_ratio, max_drawdown logger = logging.getLogger(__name__) diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index e90241e9..6b5c187d 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -9,7 +9,6 @@ from typing import List, Tuple from argparse import Namespace - import gym import torch import numpy as np @@ -21,15 +20,12 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer import seed_everything - from dagobert.naming import NRL, NStudy, NPreprocessingArgs as npa from dagobert.modelling.rl import ( RLEnv, ActorCriticTCN, ActorContinous, ActorCriticAgent, - ParallelExperiences, - ExperienceBuffer, ) from dagobert.modelling.dl import ( ExperienceSourceDataset, @@ -39,6 +35,7 @@ logger = logging.getLogger(__name__) +mp = torch.multiprocessing.get_context("spawn") def run_rl(args): @@ -111,14 +108,16 @@ def __init__(self, hparams: Namespace): self.hparams = TCNLightning._check_mini_series_lookback(hparams) # create env, policy/value networks and experience buffer + tracking vars - self.env = RLEnv(self.hparams) - n_actions = self.envs.action_space.shape[0] + self.envs = [RLEnv(self.hparams) for _ in range(self.hparams.num_workers)] + n_actions = self.envs[0].action_space.shape[0] self.critic = ActorCriticTCN( self.hparams, n_actions=n_actions, output_size=1, actor=False ) + self.critic.share_memory() self.actor = ActorContinous( ActorCriticTCN(self.hparams, n_actions=n_actions, output_size=n_actions) ) + self.actor.actor_net.share_memory() self.agent = ActorCriticAgent(self.actor, self.critic) self.buffer = ExperienceBuffer() 
self.avg_ep_reward = 0 @@ -152,15 +151,18 @@ def generate_experience_buffer( `num_assets` in the portfolio is large. Yield: - Tuple of Lists containing tensors for states, actions, log probs, qvals and - advantage. + Tuple of Lists containing tensors for states, actions, log probs, qvals + and advantage. """ # setup workers and pass them the env, agent, vars to work with max_worker_steps = int(self.hparams.steps_per_epoch / self.hparams.num_workers) parallel_experiences = ParallelExperiences() + from IPython import embed + + embed() for i in range(self.hparams.num_workers): args = ( - deepcopy(self.env), + self.envs[i], self.agent, self.device, max_worker_steps, @@ -169,7 +171,7 @@ def generate_experience_buffer( self.hparams.gamma, self.hparams.lam, ) - parallel_experiences.create_worker(args) + parallel_experiences.create_worker(*args) # collect experiences in parallel, then merge them and create dataset self.buffer.merge_buffers(parallel_experiences.collect_experiences()) @@ -208,7 +210,6 @@ def calc_advantage( values: List[float], gamma: float = 0.99, lam: float = 0.95, - norm: bool = True, ) -> List[float]: """ Calculate the advantage given rewards, state values, and last value of episode. @@ -220,7 +221,6 @@ def calc_advantage( values: list of state values from critic gamma: Gamma for discounting the long-term rewards. lam: Lambda for the GAE advantage calculation. - norm: If True, the advantages are normalised to mean=0, std=1. Returns: List of advantages. 
@@ -231,10 +231,7 @@ def calc_advantage( for i in range(len(rewards) - 1) ] adv = PPO.discount_rewards(delta, gamma * lam) - if norm: - return PPO.normalise_advantage(adv) - else: - return adv + return adv @staticmethod def normalise_advantage(batch_adv: List[float]) -> List[float]: @@ -321,51 +318,21 @@ def training_step( loss """ state, past_pw, action, old_logp, qval, adv = batch - self.log( - "avg_ep_len", self.avg_ep_len, prog_bar=True, on_step=False, on_epoch=True - ) - self.log( - "avg_ep_reward", - self.avg_ep_reward, - prog_bar=True, - on_step=False, - on_epoch=True, - ) - self.log( - "avg_reward", self.avg_reward, prog_bar=True, on_step=False, on_epoch=True - ) + adv = PPO.normalise_advantage(adv) + self.log("avg_ep_len", self.avg_ep_len, on_step=False, on_epoch=True) + self.log("avg_ep_reward", self.avg_ep_reward, on_step=False, on_epoch=True) + self.log("avg_reward", self.avg_reward, on_step=False, on_epoch=True) if optimizer_idx == 0: loss_actor, approx_kl = self.actor_loss( state, past_pw, action, old_logp, adv ) - self.log( - "loss_actor", - loss_actor, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) - self.log( - "approx_kl", - approx_kl, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) + self.log("loss_actor", loss_actor, on_epoch=True, on_step=False) + self.log("approx_kl", approx_kl, on_epoch=True, on_step=False) return loss_actor elif optimizer_idx == 1: loss_critic = self.critic_loss(state, past_pw, qval) - self.log( - "loss_critic", - loss_critic, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) + self.log("loss_critic", loss_critic, on_epoch=True, on_step=False) return loss_critic @staticmethod @@ -382,3 +349,211 @@ def _pre_sanity_check(hparams: Namespace): hparams.cols_to_model[npa.anchor] ) return hparams + + +# -------------------------------------------------------------------------------------- +# HELPER CLASSES FOR PARALLEL EXPERIENCE COLLECTION +# +# Moving this to 
another module would result in circular dependencies. Been there, +# done that, it was painful, so let's just leave these here. +# -------------------------------------------------------------------------------------- + + +class ExperienceBuffer: + """ + Object holding all states, rewards, actions, logp vals, etc of a rollout session, + i.e. the phase of the training when we're collecting experience to train on later + using the current policy of the actor. + + This is designed to work both with a single worker (single process) or with + multiple workers collecting experience in parallel. + """ + + def __init__(self): + """Class constructor""" + # step vars + self.states = [] + self.past_pws = [] + self.actions = [] + self.advs = [] + self.qvals = [] + self.logps = [] + self.infos = [] + + # episode vars + self.ep_rewards = [] + self.ep_values = [] + self.done_episodes = 0 + self.epoch_rewards = 0 + + def append( + self, + state: torch.Tensor, + past_pw: torch.Tensor, + action: torch.Tensor, + logp: torch.Tensor, + reward: float, + value: torch.Tensor, + info: dict, + ): + """ + Appends the state (including portfolio value and weights), actions, logp, + reward to the buffer after a single step taken in the environment. + + Args: + state: State that went into the agent (i.e. both actor and critic). + past_pw: Past portfolio value and weights that went into the agent. + action: Agent's action to the state and past_pw. + logp: Log-probability of the action sampled from the actor's distribution. + reward: Reward obtained by the action. + value: Estimated (by critic) reward we should have got with this action. + info: Portfolio related information returned by the env after the step. 
+ """ + # drop first batch dim so dataloader later can resample them for backprop + self.states.append([s.squeeze(0) for s in state]) + self.past_pws.append(past_pw.squeeze(0)) + self.actions.append(action) + self.logps.append(logp) + self.infos.append(info) + self.ep_rewards.append(reward) + self.ep_values.append(value.item()) + + def merge_buffers(self, buffers): + """ + Merges the passed in ExperienceBuffers and overwrites the current state with it. + + Args: + buffers: List of smaller ExpereinceBuffers to merge together from parallel + processes. + """ + pass + + def yield_dataset( + self, + ) -> Tuple[ + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + ]: + """ + Yields an iterable dataset for Pytorch Lightning from the contents of the + ExperienceBuffer. + + Yield: + Tuple of Lists containing tensors for states, actions, log probs, qvals and + advantage. + """ + data = zip( + self.states, + self.past_pws, + self.actions, + self.logps, + self.qvals, + self.advs, + ) + for state, past_pw, action, logp_old, qval, adv in data: + yield state, past_pw, action, logp_old, qval, adv + + def clear_buffer(self): + """Resets the ExperienceBuffer.""" + self.states.clear() + self.past_pws.clear() + self.actions.clear() + self.advs.clear() + self.logps.clear() + self.qvals.clear() + self.ep_rewards.clear() + self.ep_values.clear() + self.done_episodes = 0 + self.epoch_rewards = 0 + + +class ParallelExperiences: + """ + Parallelised experience gathering, idea from https://stackoverflow.com/a/45829852 + """ + + def __init__(self): + """Class constructor.""" + self.exp_queue = mp.Queue() + self.processes = [] + + def collect_experiences(self) -> List[ExperienceBuffer]: + """Returns the experiences from parallel workers. 
You need to wait for these.""" + buffers = [] + # gather results from workers using the queue and merge them into one + for process in self.processes: + buffers.append(self.exp_queue.get()) # will block + for process in self.processes: + process.join() + return buffers + + def create_worker(self, *args): + """Creates a new worker, with the args passed in for `_gather_experience`.""" + process = mp.Process(target=self.gather_experience, args=args) + self.processes.append(process) + process.start() + + def gather_experience( + self, + env: gym.Env, + agent: ActorCriticAgent, + device: torch.device, + max_steps: int, + max_episode_length: int, + asset_num: int, + gamma: float, + lam: float, + ): + """ + Workhorse function of the parallel experience gathering. This function can be + called as many times as many CPUs are available on the system, to collect the + desired number of steps and store them into an `ExperienceBuffer` that is then + passed back (via a `multiprocessing.Queue` object) to the main process that + spawned the parallel processes. + + Args: + env: An instance of the environment to act on. + agent: An instance of the PPO's `ActorCriticAgent`. + device: Device where the agent lives (GPU or CPU). + max_steps: Total number of steps (over multiple episodes) a worker can take. + max_episode_length: Maximum length of a trajectory / episode. + asset_num: Number of assets we are modelling (not including USD). + gamma: See docs of :func:`PPO.calc_advantage` + lam: See docs of :func:`PPO.calc_advantage` + + Returns: + Adds the results to `exp_queue` so it can be processed in the main process. 
+ """ + buffer = ExperienceBuffer() + state = env.reset() + past_pw = PPO._init_past_pw(asset_num, device) + for step in range(max_steps): + # get action, make step, get reward and info from env + pi, action, actor_logits, logp, value = agent(state, past_pw, device) + next_state, reward, done, info = env.step(action.cpu().numpy()) + + # store everything and update state, past_pw + buffer.append(state, past_pw, action, logp, reward, value, info) + state = next_state + past_pw = PPO._update_past_pw(info["portfolio_value"], actor_logits, device) + + terminal = len(buffer.ep_rewards) == max_episode_length + if done or terminal: + buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma)[:-1] + buffer.advs += PPO.calc_advantage( + buffer.ep_rewards, buffer.ep_values, gamma, lam + ) + buffer.done_episodes += 1 + buffer.epoch_rewards += np.sum(buffer.ep_rewards) + + # episode over, reset the env and the buffer + buffer.ep_rewards = [] + buffer.ep_values = [] + state = env.reset() + past_pw = PPO._init_past_pw(asset_num, device) + + # add collected experience to the queue so it can be returned to master process + self.exp_queue.put(buffer) diff --git a/src/dagobert/modelling/rl/utils.py b/src/dagobert/modelling/rl/utils.py index 25520783..fad0f6c7 100644 --- a/src/dagobert/modelling/rl/utils.py +++ b/src/dagobert/modelling/rl/utils.py @@ -3,209 +3,13 @@ classes for gathering experience in parallel. """ # pylint: disable=no-member -from typing import List -from multiprocessing import Process, Queue -import gym -import torch import numpy as np -from dagobert.modelling.rl import PPO, ActorCriticAgent eps = np.finfo(float).eps -class ExperienceBuffer: - """ - Object holding all states, rewards, actions, logp vals, etc of a rollout session, - i.e. the phase of the training when we're collecting experience to train on later - using the current policy of the actor. 
- - This is designed to work both with a single worker (single process) or with - multiple workers collecting experience in parallel. - """ - - def __init__(self): - """Class constructor""" - # step vars - self.states = [] - self.past_pws = [] - self.actions = [] - self.advs = [] - self.qvals = [] - self.logps = [] - self.infos = [] - - # episode vars - self.ep_rewards = [] - self.ep_values = [] - self.done_episodes = 0 - self.epoch_rewards = 0 - - def append( - self, - state: torch.Tensor, - past_pw: torch.Tensor, - action: torch.Tensor, - logp: torch.Tensor, - reward: float, - value: torch.Tensor, - info: dict, - ): - """ - Appends the state (including portfolio value and weights), actions, logp, - reward to the buffer after a single step taken in the environment. - - Args: - state: State that went into the agent (i.e. both actor and critic). - past_pw: Past portfolio value and weights that went into the agent. - action: Agent's action to the state and past_pw. - logp: Log-probability of the action sampled from the actor's distribution. - reward: Reward obtained by the action. - value: Estimated (by critic) reward we should have got with this action. - info: Portfolio related information returned by the env after the step. - """ - # drop first batch dim so dataloader later can resample them for backprop - self.states.append([s.squeeze(0) for s in state]) - self.past_pws.append(past_pw.squeeze(0)) - self.actions.append(action) - self.logps.append(logp) - self.infos.append(info) - self.ep_rewards.append(reward) - self.ep_values.append(value.item()) - - def merge_buffers(self, buffers: List[ExperienceBuffer]): - """ - Merges the passed in ExperienceBuffers and overwrites the current state with it. - - Args: - buffers: List of smaller ExpereinceBuffers to merge together from parallel - processes. 
- """ - pass - - def yield_dataset( - self, - ) -> Tuple[ - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - ]: - """ - Yields an iterable dataset for Pytorch Lightning from the contents of the - ExperienceBuffer. - - Yield: - Tuple of Lists containing tensors for states, actions, log probs, qvals and - advantage. - """ - data = zip( - self.batch_states, - self.batch_past_pw, - self.batch_actions, - self.batch_logp, - self.batch_qvals, - PPO.normalise_advantage(self.batch_adv), - ) - for state, past_pw, action, logp_old, qval, adv in data: - yield state, past_pw, action, logp_old, qval, adv - - def clear_buffer(self): - """Resets the ExperienceBuffer.""" - self.batch_states.clear() - self.batch_past_pw.clear() - self.batch_actions.clear() - self.batch_adv.clear() - self.batch_logp.clear() - self.batch_qvals.clear() - - -class ParallelExperiences: - def __init__(self): - self.exp_queue = Queue() - self.processes = [] - - def collect_experiences(self) -> List[ExperienceBuffer]: - """Returns the experiences from parallel workers. You need to wait for these.""" - exp_buffers = [] - # gather results from workers using the queue and merge them into one - for process in self.processes: - exp_buffers.append(self.exp_queue.get()) # will block - for process in self.processes: - process.join() - return exp_buffers - - def create_worker(self, *args): - """Creates a new worker, with the args passed in for `_gather_experience`.""" - process = Process(target=self.gather_experience, args=args) - self.processes.append(process) - process.start() - - def gather_experience( - self, - env: gym.Env, - agent: ActorCriticAgent, - device: torch.device, - max_steps: int, - max_episode_length: int, - asset_num: int, - gamma: float, - lam: float, - ): - """ - Workhorse function of the parallel experience gathering. 
This function can be - called as many times as many CPUs are available on the system, to collect the - desired number of steps and store them into an `ExperienceBuffer` that is then - passed back (via a `multiprocessing.Queue` object) to the main process that - spawned the parallel processes. - - Args: - env: An instance of the environment to act on. - agent: An instance of the PPO's `ActorCriticAgent`. - device: Device where the agent lives (GPU or CPU). - max_steps: Total number of steps (over multiple episodes) a worker can take. - max_episode_length: Maximum length of a trajectory / episode. - asset_num: Number of assets we are modelling (not including USD). - gamma: See docs of :func:`PPO.calc_advantage` - lam: See docs of :func:`PPO.calc_advantage` - - Returns: - Adds the results to `exp_queue` so it can be processed in the main process. - """ - buffer = ExperienceBuffer() - state = env.reset() - past_pw = PPO._init_past_pw(asset_num, device) - for step in range(max_steps): - # get action, make step, get reward and info from env - pi, action, actor_logits, logp, value = agent(state, past_pw, device) - next_state, reward, done, info = env.step(action.cpu().numpy()) - - # store everything and update state, past_pw - buffer.append(state, past_pw, action, logp, reward, value, info) - state = next_state - past_pw = PPO._update_past_pw(info["portfolio_value"], actor_logits, device) - - terminal = len(buffer.ep_rewards) == max_episode_length - if done or terminal: - buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma)[:-1] - buffer.advs += PPO.calc_advantage( - buffer.ep_rewards, buffer.ep_values, gamma, lam - ) - buffer.done_episodes += 1 - buffer.epoch_rewards += np.sum(buffer.ep_rewards) - - # episode over, reset the env and the buffer - buffer.ep_rewards = [] - buffer.ep_values = [] - state = env.reset() - past_pw = PPO._init_past_pw(asset_num, device) - - # add collected experience to the queue so it can be returned to master process - 
self.exp_queue.put(buffer) - - def sharpe_ratio(returns, freq: int = 30, rfr: int = 0): """ Given a set of returns, calculates naive (rfr=0) sharpe (eq 28). From 0dc2b310a4393c1ef00319a34cd8310a86275489 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Wed, 13 Jan 2021 20:37:44 +0000 Subject: [PATCH 26/62] gotta commit --- config/timegan_config.yaml | 3 + .../modelling/augmentation/__init__.py | 1 + .../modelling/augmentation/timegan.py | 156 +++++++++++++++++- src/dagobert/modelling/augmentation/utils.py | 21 +++ 4 files changed, 173 insertions(+), 8 deletions(-) create mode 100644 src/dagobert/modelling/augmentation/utils.py diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index e1f8d377..cd65fefd 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -26,6 +26,8 @@ batch_size: 256 # gru or lstm rnn: lstm +# embedding weight in cost of generator loss +emb_weight: 1 # -------------------------------------------------------------------------------------- # MODEL @@ -35,6 +37,7 @@ dropout: 0.2 num_layers: 2 hidden_size: 50 z_dim: 50 +mini_series_length: 240 # -------------------------------------------------------------------------------------- # DATA diff --git a/src/dagobert/modelling/augmentation/__init__.py b/src/dagobert/modelling/augmentation/__init__.py index a4400910..dbab2837 100644 --- a/src/dagobert/modelling/augmentation/__init__.py +++ b/src/dagobert/modelling/augmentation/__init__.py @@ -1 +1,2 @@ from .augmentation import augment +from .timegan import RnnBlock, TimeGANLightning diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 1790bf9b..31d0f532 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -22,6 +22,7 @@ from pytorch_lightning import LightningModule from dagobert.modelling.dl import AdaBelief +from dagobert.modelling.augmentation.utils import get_noise class RnnBlock(nn.Module): 
@@ -172,15 +173,84 @@ def __init__(self, hparams: Namespace): ) def training_step(self, batch, batch_idx, optimizer_idx): - real, _ = batch + """ + Carries out updates to networks from a batch of real samples. + Args: + batch: batch of + batch_idx: + optimizer_idx: idx that controls optimizing the 5 networks + + Returns: + Loss + """ + x, label = batch + batch_len = len(x) + z = get_noise( + batch_len, + self.hparams.mini_series_length, + self.hparams.z_dim, + device=self.tgan_device, + ) + # embedding - h = torch.tanh(self.embedder(real)) - h = torch.sigmoid(self.embedder_linear(h)) - # recover embedding - x_tilde = torch.tanh(self.recovery(h)) - x_tilde = torch.sigmoid(self.recovery_linear(x_tilde)) + h = self.embedder(x) + x_tilde = self.recovery(h) + if optimizer_idx == 0: + # recover embedding + embed_loss0 = TimeGANLightning.embed_loss0( + x_tilde, + x, + optimizer_idx, + ) + self.log( + "embed_loss0", + embed_loss0, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return embed_loss0 - pass + # generator + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + h_hat_supervise = self.supervisor(h) + + if optimizer_idx == 1: + # supervisor and generator + gen_sup_loss = TimeGANLightning.gen_loss_sup( + h_hat_supervise, + h, + optimizer_idx, + ) + self.log( + "gen_sup_loss", + gen_sup_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return gen_sup_loss + + # synthetic data + x_hat = self.recovery(h_hat) + + if optimizer_idx in [2, 3]: + with torch.no_grad(): + y_fake = self.discriminator(h_hat) + y_fake_e = self.discriminator(e_hat) + gen_loss = TimeGANLightning.generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + self.hparams.emb_weight, + ) + return gen_loss def configure_optimizers(self) -> List[optim.Optimizer]: """ @@ -234,7 +304,7 @@ def _setup_loss(self, loss_name): # CALCULATION # ---------------------------------------------------------------------------------- - def 
_calculate_loss(self, x, y_true): + def _calculate_loss2(self, x, y_true): """ Calculates the appropriate loss, given the `classification` flag. @@ -273,3 +343,73 @@ def _calculate_loss(self, x, y_true): y_pred = y_pred[:, -1] y_true = y_true[:, -1] return loss, y_true, y_pred + + @staticmethod + def embed_loss0(x_tilde, x, optimizer_idx): + """ + + Args: + x_tilde: + x: + optimizer_idx: + + Returns: + + """ + if optimizer_idx == 0: + e_loss_t0 = nn.MSELoss()(x_tilde, x) + e_loss0 = 10 * torch.sqrt(e_loss_t0) + return e_loss0 + + @staticmethod + def gen_loss_sup(h_hat_supervise, h, optimizer_idx): + """ + + Args: + h_hat_supervise: + h: + optimizer_idx: + + Returns: + + """ + if optimizer_idx == 1: + gen_sup_loss = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) + return gen_sup_loss + + @staticmethod + def generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + emb_weight, + ): + """ + + Args: + y_fake: + y_fake_e: + h: + h_hat_supervise: + x: + x_hat: + emb_weight: + optimizer_idx: + + Returns: + + """ + + # adversarial + g_loss_u = nn.BCELoss()(y_fake, torch.ones_like(y_fake)) + g_loss_u_e = nn.BCELoss()(y_fake_e, torch.ones_like(y_fake_e)) + # supervisor + g_loss_s = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) + # 2 moments + d = torch.sqrt(torch.var(x_hat, 0) + 1e-6) - torch.sqrt(torch.var(x, 0) + 1e-6) + g_loss_v1 = torch.mean(torch.abs(d)) + g_loss_v2 = torch.mean(torch.abs(torch.mean(x_hat, 0) - torch.mean(x, 0))) + g_loss_v = g_loss_v1 + g_loss_v2 diff --git a/src/dagobert/modelling/augmentation/utils.py b/src/dagobert/modelling/augmentation/utils.py new file mode 100644 index 00000000..72e162eb --- /dev/null +++ b/src/dagobert/modelling/augmentation/utils.py @@ -0,0 +1,21 @@ +"""Util functions for TimeGAN and other augmentation related tasks""" + +import torch + + +def get_noise(n_samples: int, mini_series_length: int, z_dim: int, device: str = "cpu"): + """ + Function for creating noise vectors given the dimensions 
(n_samples, + mini_series_length, z_dim). Research shows that it is not hyperimportant which + distribution is the noise from, here we'll use uniform + + Args: + n_samples: the number of samples to generate + mini_series_length: length of series + z_dim: dimension for generator input at given time point + device: the device type + + Returns: + Tensor of filled with random numbers from uniform distribution. + """ + return torch.rand(n_samples, mini_series_length, z_dim, device=device) From 4e7e0a89fa9cf035ed1cd3fe036fd388975d13b4 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Thu, 14 Jan 2021 17:11:56 +0000 Subject: [PATCH 27/62] on track --- .../modelling/augmentation/timegan.py | 170 ++++++++++-------- 1 file changed, 92 insertions(+), 78 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 31d0f532..71d9e8d9 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -200,7 +200,6 @@ def training_step(self, batch, batch_idx, optimizer_idx): embed_loss0 = TimeGANLightning.embed_loss0( x_tilde, x, - optimizer_idx, ) self.log( "embed_loss0", @@ -222,7 +221,6 @@ def training_step(self, batch, batch_idx, optimizer_idx): gen_sup_loss = TimeGANLightning.gen_loss_sup( h_hat_supervise, h, - optimizer_idx, ) self.log( "gen_sup_loss", @@ -237,7 +235,9 @@ def training_step(self, batch, batch_idx, optimizer_idx): # synthetic data x_hat = self.recovery(h_hat) - if optimizer_idx in [2, 3]: + # TODO: If you need to control how often those optimizers step or override + # the default .step() schedule, override the optimizer_step() hook. 
+ if optimizer_idx == 2: with torch.no_grad(): y_fake = self.discriminator(h_hat) y_fake_e = self.discriminator(e_hat) @@ -250,8 +250,50 @@ def training_step(self, batch, batch_idx, optimizer_idx): x_hat, self.hparams.emb_weight, ) + self.log( + "gen_loss", + gen_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) return gen_loss + if optimizer_idx == 3: + embed_loss = TimeGANLightning.embed_loss( + x_tilde, + x, + h_hat_supervise, + h, + ) + self.log( + "embed_loss", + embed_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return embed_loss + + if optimizer_idx == 4: + # y_fake = + # y_fake_e = + # y_real = + disc_loss = TimeGANLightning.discriminator_loss( + y_fake, y_fake_e, y_real, self.hparams.emb_weight + ) + self.log( + "disc_loss", + disc_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return disc_loss + def configure_optimizers(self) -> List[optim.Optimizer]: """ Optimizer setup. list of optimizers accessed by idx in training step. @@ -278,104 +320,38 @@ def configure_optimizers(self) -> List[optim.Optimizer]: # SETUP FUNCTIONS # ---------------------------------------------------------------------------------- - def _setup_loss(self, loss_name): - """ - Different losses are used to direct updates in the components of TimeGAN. 
- """ - if loss_name == "BCEWithLogits": - return nn.BCEWithLogitsLoss() - elif self.hparams.regression: - return nn.MSELoss() - - else: - if self.hparams.no_class_weights: - if self.hparams.output_size > 1: - return nn.CrossEntropyLoss() - else: - return nn.BCEWithLogitsLoss() - else: - if self.hparams.output_size > 1: - return nn.CrossEntropyLoss(self._get_class_weights()) - else: - pos_weight = self._get_class_weights()[1] - return nn.BCEWithLogitsLoss(pos_weight=pos_weight) - # ---------------------------------------------------------------------------------- # CALCULATION # ---------------------------------------------------------------------------------- - def _calculate_loss2(self, x, y_true): - """ - Calculates the appropriate loss, given the `classification` flag. - - Args: - x: A batch of X. - y_true: A batch of target. - - Returns: - Tuple of loss, y_true and y_pred. - """ - y_pred = self(*[xi.float() for xi in x]) - if self.hparams.output_size == 1 and self.hparams.last_y: - y_true = y_true.reshape(-1, 1).float() - elif self.hparams.output_size == 1 and not self.hparams.last_y: - y_true = y_true.float() - elif self.hparams.output_size == 3 and not self.hparams.regression: - # convert triple barrier method's -1/0/1 into 0, 1, 2 torch's cross-entropy - y_true = y_true.long() + 1 - if not self.hparams.last_y: - # we only keep the latest fraction of labels of the mini-series - keep = int(self.hparams.mini_series_length * self.hparams.non_last_y_frac) - keep_ix = self.hparams.mini_series_length - keep - y_true = y_true[:, keep_ix:] - if self.hparams.output_size == 1: - y_pred = y_pred[:, keep_ix:] - else: - y_pred = y_pred.transpose(1, 2)[:, :, keep_ix:] - loss = self.loss_f(y_pred, y_true) - - if self.hparams.mix_density_net: - # for mix density nets we need to estimate y_preds as a mixture of mus - y_pred = self.loss_f.get_mu_preds(y_pred) - elif not self.hparams.last_y: - # we only use the last timepoint's pred for plotting and metric calculation - # 
otherwise we often run out of memory at the end of the epoch - y_pred = y_pred[:, -1] - y_true = y_true[:, -1] - return loss, y_true, y_pred - @staticmethod - def embed_loss0(x_tilde, x, optimizer_idx): + def embed_loss0(x_tilde, x): """ Args: x_tilde: x: - optimizer_idx: Returns: """ - if optimizer_idx == 0: - e_loss_t0 = nn.MSELoss()(x_tilde, x) - e_loss0 = 10 * torch.sqrt(e_loss_t0) - return e_loss0 + e_loss_t0 = nn.MSELoss()(x_tilde, x) + e_loss0 = 10 * torch.sqrt(e_loss_t0) + return e_loss0 @staticmethod - def gen_loss_sup(h_hat_supervise, h, optimizer_idx): + def gen_loss_sup(h_hat_supervise, h): """ Args: h_hat_supervise: h: - optimizer_idx: Returns: """ - if optimizer_idx == 1: - gen_sup_loss = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) - return gen_sup_loss + gen_sup_loss = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) + return gen_sup_loss @staticmethod def generator_loss( @@ -397,15 +373,14 @@ def generator_loss( x: x_hat: emb_weight: - optimizer_idx: Returns: """ - # adversarial g_loss_u = nn.BCELoss()(y_fake, torch.ones_like(y_fake)) g_loss_u_e = nn.BCELoss()(y_fake_e, torch.ones_like(y_fake_e)) + w_g_loss_u_e = emb_weight * g_loss_u_e # supervisor g_loss_s = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) # 2 moments @@ -413,3 +388,42 @@ def generator_loss( g_loss_v1 = torch.mean(torch.abs(d)) g_loss_v2 = torch.mean(torch.abs(torch.mean(x_hat, 0) - torch.mean(x, 0))) g_loss_v = g_loss_v1 + g_loss_v2 + # sum + g_loss = g_loss_u + w_g_loss_u_e + 100 * torch.sqrt(g_loss_s) + 100 * g_loss_v + return g_loss + + @staticmethod + def embed_loss(x_tilde, x, h_hat_supervise, h): + """ + + Args: + x_tilde: + x: + h_hat_supervise: + h: + + Returns: + + """ + e_loss_t0 = nn.MSELoss()(x_tilde, x) + e_loss0 = 10 * torch.sqrt(e_loss_t0) + e_loss = e_loss0 + 0.1 * nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) + return e_loss + + @staticmethod + def discriminator_loss(y_fake, y_fake_e, y_real, emb_weight): + """ + + Args: + y_fake: + 
y_fake_e: + y_real: + emb_weight: + + Returns: + + """ + d_loss_fake_e = nn.BCELoss()(y_fake_e, torch.zeros_like(y_fake_e)) + d_loss_fake = nn.BCELoss()(y_fake, torch.zeros_like(y_fake)) + d_loss_real = nn.BCELoss()(y_real, torch.ones_like(y_real)) + return emb_weight * d_loss_fake_e + d_loss_fake + d_loss_real From 2ee9d54fc82a81956330920f36db94c1a19feb71 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Thu, 14 Jan 2021 17:23:52 +0000 Subject: [PATCH 28/62] disc --- src/dagobert/modelling/augmentation/timegan.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 71d9e8d9..7ee32983 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -278,9 +278,13 @@ def training_step(self, batch, batch_idx, optimizer_idx): return embed_loss if optimizer_idx == 4: - # y_fake = - # y_fake_e = - # y_real = + e_hat = self.generator() + h_hat = self.supervisor(e_hat) + + y_fake = self.discriminator(h_hat.detach()) + y_fake_e = self.discriminator(e_hat.detach()) + y_real = self.discriminator(h.detach()) + disc_loss = TimeGANLightning.discriminator_loss( y_fake, y_fake_e, y_real, self.hparams.emb_weight ) From f528469a0c13d64f76a3e84d1b6a10bb79699149 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Fri, 15 Jan 2021 09:50:01 +0000 Subject: [PATCH 29/62] got parallel experience gathering to a point, but now I simpy can't make it work unless I reengineer everything, see https://github.com/danielhomola/dagobert/issues/65 --- config/rl_config.yaml | 5 ++-- src/dagobert/modelling/dl/tcn_args.py | 13 +++++++++ src/dagobert/modelling/dl/tcn_net.py | 42 +++++++++++++++------------ src/dagobert/modelling/rl/networks.py | 3 +- src/dagobert/modelling/rl/ppo.py | 8 +++-- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 5f273987..cce7b3d7 100644 
--- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -4,7 +4,7 @@ # LIGHTNING # -------------------------------------------------------------------------------------- -gpus: 1 +gpus: 0 pin_memory: True profiler: True #val_check_interval: 0.5 @@ -17,7 +17,7 @@ auto_scale_batch_size: # -------------------------------------------------------------------------------------- log_dir: logs -num_workers: 1 +num_workers: 4 exp_name: RL-PPO-TCN tags: - RL_test @@ -49,6 +49,7 @@ target_col: rl_return to_label: False no_sample_weights: True binariser_method: +no_weight_norm: True # -------------------------------------------------------------------------------------- # MODEL diff --git a/src/dagobert/modelling/dl/tcn_args.py b/src/dagobert/modelling/dl/tcn_args.py index d08b0a0b..21df5792 100644 --- a/src/dagobert/modelling/dl/tcn_args.py +++ b/src/dagobert/modelling/dl/tcn_args.py @@ -188,6 +188,19 @@ def add_model_specific_args(parent_parser): "multi-class (3) classification with CrossEntropyLoss." ), ) + parser.add_argument( + "--no_weight_norm", + action="store_true", + help=( + " Weight norm is registered as a pre_forward_hook on the 1D convolutional " + "layers of the TemporalBlock, and these cannot be serialised when training " + "with parallel processes interacting with the model concurrently. If True, " + "we add weight normalisation around these layers, and TCN cannot be used " + "in a multiprocessing setting. If False, then it can be used, even staying " + "on GPU in linux (CPU only on Windows)." 
+ ), + ) + parser.add_argument( "--no_class_weights", action="store_true", diff --git a/src/dagobert/modelling/dl/tcn_net.py b/src/dagobert/modelling/dl/tcn_net.py index b49b139a..344ad328 100644 --- a/src/dagobert/modelling/dl/tcn_net.py +++ b/src/dagobert/modelling/dl/tcn_net.py @@ -48,33 +48,34 @@ def __init__( dilation, padding, dropout=0.2, + no_weight_norm=False, ): super(TemporalBlock, self).__init__() - self.conv1 = weight_norm( - nn.Conv1d( - n_inputs, - n_outputs, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - ) + self.conv1 = nn.Conv1d( + n_inputs, + n_outputs, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, ) + if not no_weight_norm: + self.conv1 = weight_norm(self.conv1) self.chomp1 = Chomp1d(padding) self.relu1 = nn.ReLU() self.dropout1 = nn.Dropout(dropout) self.batch_norm1 = nn.BatchNorm1d(n_outputs) - self.conv2 = weight_norm( - nn.Conv1d( - n_outputs, - n_outputs, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - ) + self.conv2 = nn.Conv1d( + n_outputs, + n_outputs, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, ) + if not no_weight_norm: + self.conv2 = weight_norm(self.conv2) self.chomp2 = Chomp1d(padding) self.relu2 = nn.ReLU() self.dropout2 = nn.Dropout(dropout) @@ -112,6 +113,7 @@ def __init__( dropout: float = 0.2, time_feat_n: int = 1, time_embed_dim: int = 12, + no_weight_norm: bool = False, ): """ Class constructor. @@ -126,6 +128,8 @@ def __init__( time_feat_n: Number of time features per input DF. Note this has to be consistent across all input DFs, you can't mix and match. time_embed_dim: Dimensionality of time2vec vectors. + no_weight_norm: If True, we don't add weight_norm to 1dconv layers. See + no_weight_norm param help in `tcn_args.py` for more info. 
""" super(TemporalConvNet, self).__init__() @@ -147,6 +151,7 @@ def __init__( dilation=1, padding=(kernel_size - 1), dropout=dropout, + no_weight_norm=no_weight_norm, ) ) @@ -167,6 +172,7 @@ def __init__( dilation=dilation_size, padding=(kernel_size - 1) * dilation_size, dropout=dropout, + no_weight_norm=no_weight_norm, ) ] self.later_layers = nn.Sequential(*layers) diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 4082a7cf..061e7366 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -49,11 +49,11 @@ def __init__( dropout=dropout, time_feat_n=hparams.time_feat_n, time_embed_dim=hparams.time_embed_dim, + no_weight_norm=hparams.no_weight_norm, ) self.linear_a = nn.Linear(n_actions + 1, num_channels[-1]) self.linear1 = nn.Linear(hparams.mini_series_length, 1) self.linear2 = nn.Linear(num_channels[-1] * 2, output_size) - # self.linear2 = nn.Linear(num_channels[-1], output_size) def forward(self, state, past_pw): s1 = self.tcn(*state) @@ -64,7 +64,6 @@ def forward(self, state, past_pw): s2 = torch.tanh(self.linear1(s1).squeeze(-1)) # bring together the state and past_pw representations make residual connection return past_pw[:, 1:] + self.linear2(torch.cat([s2, a1], dim=1)) - # return self.linear2(s2) class ActorContinous(nn.Module): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 6b5c187d..df900659 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -157,14 +157,16 @@ def generate_experience_buffer( # setup workers and pass them the env, agent, vars to work with max_worker_steps = int(self.hparams.steps_per_epoch / self.hparams.num_workers) parallel_experiences = ParallelExperiences() - from IPython import embed - embed() + self.agent.critic_net.cpu() + self.agent.critic_net.eval() + self.agent.actor_net.cpu() + self.agent.actor_net.eval() for i in range(self.hparams.num_workers): args = ( self.envs[i], 
self.agent, - self.device, + "cpu", max_worker_steps, self.hparams.max_episode_length, len(self.hparams.asset_names), From 7c144d4efd089d9ce7f2a634b6f2e0112353eef7 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Fri, 15 Jan 2021 15:16:38 +0000 Subject: [PATCH 30/62] training_step is done for now --- .../modelling/augmentation/timegan.py | 210 +++++++++--------- 1 file changed, 106 insertions(+), 104 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 7ee32983..7b853d1a 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -106,7 +106,6 @@ def __init__(self, hparams: Namespace): self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" # TODO: check if real data is the right one, get data in # TODO: any sanity checks on data, hypermparams - # TODO set up losses self.real_logging = None self.comet_logging = not self.hparams.no_comet_logger @@ -183,124 +182,127 @@ def training_step(self, batch, batch_idx, optimizer_idx): Returns: Loss """ + # TODO: is there any label to give back? 
x, label = batch batch_len = len(x) - z = get_noise( - batch_len, - self.hparams.mini_series_length, - self.hparams.z_dim, - device=self.tgan_device, - ) - # embedding h = self.embedder(x) - x_tilde = self.recovery(h) - if optimizer_idx == 0: - # recover embedding - embed_loss0 = TimeGANLightning.embed_loss0( - x_tilde, - x, - ) - self.log( - "embed_loss0", - embed_loss0, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) - return embed_loss0 - - # generator - e_hat = self.generator(z) - h_hat = self.supervisor(e_hat) - h_hat_supervise = self.supervisor(h) - - if optimizer_idx == 1: - # supervisor and generator - gen_sup_loss = TimeGANLightning.gen_loss_sup( - h_hat_supervise, - h, - ) + + # optimizers #0 & #3 update embedder nets + if optimizer_idx in [0, 3]: + x_tilde = self.recovery(h) + # optimize embedding via embedder and recovery nets + if optimizer_idx == 0: + e_loss = TimeGANLightning.embed_loss0(x_tilde, x) + self.log( + "e_loss", + e_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return e_loss + + elif optimizer_idx == 3: + h_hat_supervise = self.supervisor(h) + embed_loss = TimeGANLightning.embedder_loss( + x_tilde, + x, + h_hat_supervise, + h, + ) + self.log( + "embed_loss", + embed_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return embed_loss + + # optimize supervisor + elif optimizer_idx == 1: + h_hat_supervise = self.supervisor(h) + supervise_loss = TimeGANLightning.supervisor_loss(h_hat_supervise, h) self.log( - "gen_sup_loss", - gen_sup_loss, + "supervise_loss", + supervise_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, ) - return gen_sup_loss - - # synthetic data - x_hat = self.recovery(h_hat) + return supervise_loss # TODO: If you need to control how often those optimizers step or override # the default .step() schedule, override the optimizer_step() hook. 
- if optimizer_idx == 2: - with torch.no_grad(): - y_fake = self.discriminator(h_hat) - y_fake_e = self.discriminator(e_hat) - gen_loss = TimeGANLightning.generator_loss( - y_fake, - y_fake_e, - h, - h_hat_supervise, - x, - x_hat, - self.hparams.emb_weight, - ) - self.log( - "gen_loss", - gen_loss, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) - return gen_loss - - if optimizer_idx == 3: - embed_loss = TimeGANLightning.embed_loss( - x_tilde, - x, - h_hat_supervise, - h, - ) - self.log( - "embed_loss", - embed_loss, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) - return embed_loss - - if optimizer_idx == 4: - e_hat = self.generator() - h_hat = self.supervisor(e_hat) - - y_fake = self.discriminator(h_hat.detach()) - y_fake_e = self.discriminator(e_hat.detach()) - y_real = self.discriminator(h.detach()) - - disc_loss = TimeGANLightning.discriminator_loss( - y_fake, y_fake_e, y_real, self.hparams.emb_weight - ) - self.log( - "disc_loss", - disc_loss, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, + elif optimizer_idx in [2, 4]: + # random input to generator + z = get_noise( + batch_len, + self.hparams.mini_series_length, + self.hparams.z_dim, + device=self.tgan_device, ) - return disc_loss + # update generator + if optimizer_idx == 2: + + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + h_hat_supervise = self.supervisor(h) + + # synthetic data + x_hat = self.recovery(h_hat) + # no_grad to leave discriminator unchanged + with torch.no_grad(): + y_fake = self.discriminator(h_hat) + y_fake_e = self.discriminator(e_hat) + gen_loss = TimeGANLightning.generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + self.hparams.emb_weight, + ) + self.log( + "gen_loss", + gen_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return gen_loss + + # update discriminator + elif optimizer_idx == 4: + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + # 
detach to update only discriminator + y_fake = self.discriminator(h_hat.detach()) + y_fake_e = self.discriminator(e_hat.detach()) + y_real = self.discriminator(h.detach()) + + disc_loss = TimeGANLightning.discriminator_loss( + y_fake, y_fake_e, y_real, self.hparams.emb_weight + ) + self.log( + "disc_loss", + disc_loss, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return disc_loss def configure_optimizers(self) -> List[optim.Optimizer]: """ - Optimizer setup. list of optimizers accessed by idx in training step. + Optimizer setup. List of optimizers accessed by idx in training step. """ optimizers = [] param_pairs = [ @@ -344,7 +346,7 @@ def embed_loss0(x_tilde, x): return e_loss0 @staticmethod - def gen_loss_sup(h_hat_supervise, h): + def supervisor_loss(h_hat_supervise, h): """ Args: @@ -397,7 +399,7 @@ def generator_loss( return g_loss @staticmethod - def embed_loss(x_tilde, x, h_hat_supervise, h): + def embedder_loss(x_tilde, x, h_hat_supervise, h): """ Args: From 2c9edd4a13f68bd2d49a4f6d2090969cf13fb37d Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Fri, 15 Jan 2021 17:29:48 +0000 Subject: [PATCH 31/62] start data --- .../modelling/augmentation/timegan.py | 141 ++++++++++-------- 1 file changed, 82 insertions(+), 59 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 7b853d1a..44268be1 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -1,6 +1,8 @@ """ TimeGAN network, following the original implementation: https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/tgan.py. 
+& +https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf """ from typing import List, Optional from argparse import Namespace @@ -18,6 +20,7 @@ import torch.optim as optim import torch.nn.functional as f from torch.nn.utils import weight_norm +from torch.utils.data import Dataset, WeightedRandomSampler, RandomSampler, DataLoader from pytorch_lightning import LightningModule @@ -86,7 +89,7 @@ class TimeGANLightning(LightningModule): """ # ---------------------------------------------------------------------------------- - # INIT, FORWARD, OPTIMIZER SETUP + # INIT, (FORWARD) # ---------------------------------------------------------------------------------- def __init__(self, hparams: Namespace): @@ -101,7 +104,7 @@ def __init__(self, hparams: Namespace): # define main vars (other than model) super().__init__() - # TODO: sanity check, define hparams + # TODO: pre sanity check, define hparams # lightning sets this to cuda too late for some of our setup to work self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" # TODO: check if real data is the right one, get data in @@ -170,6 +173,11 @@ def __init__(self, hparams: Namespace): rnn=self.hparams.rnn, linear_activation=False, ) + self = self.float() + + # ---------------------------------------------------------------------------------- + # OPTIMIZER SETUP & TRAIN + # ---------------------------------------------------------------------------------- def training_step(self, batch, batch_idx, optimizer_idx): """ @@ -193,48 +201,48 @@ def training_step(self, batch, batch_idx, optimizer_idx): x_tilde = self.recovery(h) # optimize embedding via embedder and recovery nets if optimizer_idx == 0: - e_loss = TimeGANLightning.embed_loss0(x_tilde, x) + loss_e = TimeGANLightning.embed_loss0(x_tilde, x) self.log( - "e_loss", - e_loss, + "loss_e", + loss_e, on_step=False, on_epoch=True, prog_bar=True, logger=True, ) - return e_loss + return loss_e elif optimizer_idx == 3: h_hat_supervise = 
self.supervisor(h) - embed_loss = TimeGANLightning.embedder_loss( + loss_embed = TimeGANLightning.embedder_loss( x_tilde, x, h_hat_supervise, h, ) self.log( - "embed_loss", - embed_loss, + "loss_embed", + loss_embed, on_step=False, on_epoch=True, prog_bar=True, logger=True, ) - return embed_loss + return loss_embed # optimize supervisor elif optimizer_idx == 1: h_hat_supervise = self.supervisor(h) - supervise_loss = TimeGANLightning.supervisor_loss(h_hat_supervise, h) + loss_supervisor = TimeGANLightning.supervisor_loss(h_hat_supervise, h) self.log( - "supervise_loss", - supervise_loss, + "loss_supervisor", + loss_supervisor, on_step=False, on_epoch=True, prog_bar=True, logger=True, ) - return supervise_loss + return loss_supervisor # TODO: If you need to control how often those optimizers step or override # the default .step() schedule, override the optimizer_step() hook. @@ -259,7 +267,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): with torch.no_grad(): y_fake = self.discriminator(h_hat) y_fake_e = self.discriminator(e_hat) - gen_loss = TimeGANLightning.generator_loss( + loss_gen = TimeGANLightning.generator_loss( y_fake, y_fake_e, h, @@ -269,14 +277,14 @@ def training_step(self, batch, batch_idx, optimizer_idx): self.hparams.emb_weight, ) self.log( - "gen_loss", - gen_loss, + "loss_gen", + loss_gen, on_step=False, on_epoch=True, prog_bar=True, logger=True, ) - return gen_loss + return loss_gen # update discriminator elif optimizer_idx == 4: @@ -287,18 +295,18 @@ def training_step(self, batch, batch_idx, optimizer_idx): y_fake_e = self.discriminator(e_hat.detach()) y_real = self.discriminator(h.detach()) - disc_loss = TimeGANLightning.discriminator_loss( + loss_disc = TimeGANLightning.discriminator_loss( y_fake, y_fake_e, y_real, self.hparams.emb_weight ) self.log( - "disc_loss", - disc_loss, + "loss_disc", + loss_disc, on_step=False, on_epoch=True, prog_bar=True, logger=True, ) - return disc_loss + return loss_disc def configure_optimizers(self) 
-> List[optim.Optimizer]: """ @@ -325,21 +333,30 @@ def configure_optimizers(self) -> List[optim.Optimizer]: # ---------------------------------------------------------------------------------- # SETUP FUNCTIONS # ---------------------------------------------------------------------------------- + def train_dataloader( + self, + ) -> DataLoader: + """ + + Returns: + + """ + return Dataloader(dataset=dataset, batch_size=self.hparams.batch_size) # ---------------------------------------------------------------------------------- # CALCULATION # ---------------------------------------------------------------------------------- - @staticmethod def embed_loss0(x_tilde, x): """ - + Loss guiding reversible mapping between feature and latent spaces to enable + embedding and recovery nets to reconstruct original data. Args: - x_tilde: - x: + x_tilde: decoded real samples + x: real samples Returns: - + Loss """ e_loss_t0 = nn.MSELoss()(x_tilde, x) e_loss0 = 10 * torch.sqrt(e_loss_t0) @@ -348,13 +365,14 @@ def embed_loss0(x_tilde, x): @staticmethod def supervisor_loss(h_hat_supervise, h): """ - + This loss further ensures that generator produces similar stepwise transitions + (evaluated by ground-truth targets). Args: - h_hat_supervise: - h: + h_hat_supervise: supervisors output from feeding h (real embedding) through + h: real embedding defined by embedder net Returns: - + Loss """ gen_sup_loss = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) return gen_sup_loss @@ -370,25 +388,26 @@ def generator_loss( emb_weight, ): """ - + Loss of generator combining adversarial & supervisor losses together with + looking at difference between final synthetic output and original data. 
Args: - y_fake: - y_fake_e: - h: - h_hat_supervise: - x: - x_hat: - emb_weight: + y_fake: logits for classification of fakes (from h_hat) + y_fake_e: logits for classification of fake embeddings (from e_hat) + h: real embedding defined by embedder net + h_hat_supervise: supervisors output from feeding h (real embedding) through + x: real samples + x_hat: decoded samples of embedding created by generator + emb_weight: weight defining how much embedded fake contributes to loss Returns: - + Loss """ # adversarial g_loss_u = nn.BCELoss()(y_fake, torch.ones_like(y_fake)) g_loss_u_e = nn.BCELoss()(y_fake_e, torch.ones_like(y_fake_e)) w_g_loss_u_e = emb_weight * g_loss_u_e # supervisor - g_loss_s = nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) + g_loss_s = TimeGANLightning.supervisor_loss(h_hat_supervise, h) # 2 moments d = torch.sqrt(torch.var(x_hat, 0) + 1e-6) - torch.sqrt(torch.var(x, 0) + 1e-6) g_loss_v1 = torch.mean(torch.abs(d)) @@ -401,35 +420,39 @@ def generator_loss( @staticmethod def embedder_loss(x_tilde, x, h_hat_supervise, h): """ - + Loss to further improve reversible mapping between feature and latent space, + combined with Args: - x_tilde: - x: - h_hat_supervise: - h: + x_tilde: decoded real samples + x: real samples + h_hat_supervise: supervisors output from feeding h (real embedding) through + h: real embedding defined by embedder net Returns: - + Loss """ - e_loss_t0 = nn.MSELoss()(x_tilde, x) - e_loss0 = 10 * torch.sqrt(e_loss_t0) - e_loss = e_loss0 + 0.1 * nn.MSELoss()(h_hat_supervise[:, 1:, :], h[:, 1:, :]) + e_loss0 = TimeGANLightning.embed_loss0(x_tilde, x) + e_loss = e_loss0 + 0.1 * TimeGANLightning.supervisor_loss(h_hat_supervise, h) return e_loss @staticmethod def discriminator_loss(y_fake, y_fake_e, y_real, emb_weight): """ - + Discriminator’s binary adversarial feedback, both on fake and real data. 
Args: - y_fake: - y_fake_e: - y_real: - emb_weight: + y_fake: logits for classification of fakes (from h_hat) + y_fake_e: logits for classification of fake embeddings (from e_hat) + y_real: logits for classification of real embeddings (from h) + emb_weight: weight defining how much embedded fake contributes to loss Returns: - + Loss """ - d_loss_fake_e = nn.BCELoss()(y_fake_e, torch.zeros_like(y_fake_e)) - d_loss_fake = nn.BCELoss()(y_fake, torch.zeros_like(y_fake)) - d_loss_real = nn.BCELoss()(y_real, torch.ones_like(y_real)) + # TODO: is this the correct loss? discriminator returns logits w/out activation. + # changed compared to original TF implementation + criterion = nn.BCEWithLogitsLoss() + d_loss_fake_e = criterion(y_fake_e, torch.zeros_like(y_fake_e)) + d_loss_fake = criterion(y_fake, torch.zeros_like(y_fake)) + d_loss_real = criterion(y_real, torch.ones_like(y_real)) + # TODO: any use of dividing loss by (2 + emb_weight)? return emb_weight * d_loss_fake_e + d_loss_fake + d_loss_real From 1123074ea2612bec0848a7f5a9e87adc770efe95 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 16 Jan 2021 09:58:27 +0000 Subject: [PATCH 32/62] I cannot fucking believe it but I think I managed to crack this multiprocessing.. 
--- src/dagobert/modelling/rl/ppo.py | 137 ++++++++++++++++--------------- 1 file changed, 73 insertions(+), 64 deletions(-) diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index df900659..02fe9f70 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -157,11 +157,13 @@ def generate_experience_buffer( # setup workers and pass them the env, agent, vars to work with max_worker_steps = int(self.hparams.steps_per_epoch / self.hparams.num_workers) parallel_experiences = ParallelExperiences() - self.agent.critic_net.cpu() self.agent.critic_net.eval() self.agent.actor_net.cpu() self.agent.actor_net.eval() + from IPython import embed + + embed() for i in range(self.hparams.num_workers): args = ( self.envs[i], @@ -479,7 +481,7 @@ class ParallelExperiences: def __init__(self): """Class constructor.""" - self.exp_queue = mp.Queue() + self.queue = mp.Queue() self.processes = [] def collect_experiences(self) -> List[ExperienceBuffer]: @@ -487,75 +489,82 @@ def collect_experiences(self) -> List[ExperienceBuffer]: buffers = [] # gather results from workers using the queue and merge them into one for process in self.processes: - buffers.append(self.exp_queue.get()) # will block + buffers.append(self.queue.get()) # will block for process in self.processes: process.join() return buffers - def create_worker(self, *args): + def create_worker(self, *args, **kwargs): """Creates a new worker, with the args passed in for `_gather_experience`.""" - process = mp.Process(target=self.gather_experience, args=args) + args_for_wrapper = [gather_experience, self.queue, args, kwargs] + process = mp.Process(target=self._wrapper, args=args_for_wrapper) self.processes.append(process) process.start() - def gather_experience( - self, - env: gym.Env, - agent: ActorCriticAgent, - device: torch.device, - max_steps: int, - max_episode_length: int, - asset_num: int, - gamma: float, - lam: float, - ): - """ - Workhorse function of the 
parallel experience gathering. This function can be - called as many times as many CPUs are available on the system, to collect the - desired number of steps and store them into an `ExperienceBuffer` that is then - passed back (via a `multiprocessing.Queue` object) to the main process that - spawned the parallel processes. - - Args: - env: An instance of the environment to act on. - agent: An instance of the PPO's `ActorCriticAgent`. - device: Device where the agent lives (GPU or CPU). - max_steps: Total number of steps (over multiple episodes) a worker can take. - max_episode_length: Maximum length of a trajectory / episode. - asset_num: Number of assets we are modelling (not including USD). - gamma: See docs of :func:`PPO.calc_advantage` - lam: See docs of :func:`PPO.calc_advantage` + @staticmethod + def _wrapper(func, queue, args, kwargs): + """This NEEDS to be a static method for multiprocessing to work""" + buffer = func(*args, **kwargs) + # add collected experience to the queue so it can be returned to master process + queue.put(buffer) + + +def gather_experience( + env: gym.Env, + agent: ActorCriticAgent, + device: torch.device, + max_steps: int, + max_episode_length: int, + asset_num: int, + gamma: float, + lam: float, +): + """ + Workhorse function of the parallel experience gathering. This function can be + called as many times as many CPUs are available on the system, to collect the + desired number of steps and store them into an `ExperienceBuffer` that is then + passed back (via a `multiprocessing.Queue` object) to the main process that + spawned the parallel processes. + + Args: + env: An instance of the environment to act on. + agent: An instance of the PPO's `ActorCriticAgent`. + device: Device where the agent lives (GPU or CPU). + max_steps: Total number of steps (over multiple episodes) a worker can take. + max_episode_length: Maximum length of a trajectory / episode. + asset_num: Number of assets we are modelling (not including USD). 
+ gamma: See docs of :func:`PPO.calc_advantage` + lam: See docs of :func:`PPO.calc_advantage` + + Returns: + Adds the results to `exp_queue` so it can be processed in the main process. + """ + buffer = ExperienceBuffer() + state = env.reset() + past_pw = PPO._init_past_pw(asset_num, device) + for step in range(max_steps): + # get action, make step, get reward and info from env + pi, action, actor_logits, logp, value = agent(state, past_pw, device) + next_state, reward, done, info = env.step(action.cpu().numpy()) + + # store everything and update state, past_pw + buffer.append(state, past_pw, action, logp, reward, value, info) + state = next_state + past_pw = PPO._update_past_pw(info["portfolio_value"], actor_logits, device) + + terminal = len(buffer.ep_rewards) == max_episode_length + if done or terminal: + buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma)[:-1] + buffer.advs += PPO.calc_advantage( + buffer.ep_rewards, buffer.ep_values, gamma, lam + ) + buffer.done_episodes += 1 + buffer.epoch_rewards += np.sum(buffer.ep_rewards) - Returns: - Adds the results to `exp_queue` so it can be processed in the main process. 
- """ - buffer = ExperienceBuffer() - state = env.reset() - past_pw = PPO._init_past_pw(asset_num, device) - for step in range(max_steps): - # get action, make step, get reward and info from env - pi, action, actor_logits, logp, value = agent(state, past_pw, device) - next_state, reward, done, info = env.step(action.cpu().numpy()) - - # store everything and update state, past_pw - buffer.append(state, past_pw, action, logp, reward, value, info) - state = next_state - past_pw = PPO._update_past_pw(info["portfolio_value"], actor_logits, device) - - terminal = len(buffer.ep_rewards) == max_episode_length - if done or terminal: - buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma)[:-1] - buffer.advs += PPO.calc_advantage( - buffer.ep_rewards, buffer.ep_values, gamma, lam - ) - buffer.done_episodes += 1 - buffer.epoch_rewards += np.sum(buffer.ep_rewards) - - # episode over, reset the env and the buffer - buffer.ep_rewards = [] - buffer.ep_values = [] - state = env.reset() - past_pw = PPO._init_past_pw(asset_num, device) + # episode over, reset the env and the buffer + buffer.ep_rewards = [] + buffer.ep_values = [] + state = env.reset() + past_pw = PPO._init_past_pw(asset_num, device) - # add collected experience to the queue so it can be returned to master process - self.exp_queue.put(buffer) + return buffer From 86dcdfdcc68c2c2694eacd9b90dac38600be007a Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 16 Jan 2021 16:05:17 +0000 Subject: [PATCH 33/62] it works, even on windows but after a while it crashes with RuntimeError: Couldn't open shared event -> let's try it on linux now --- config/rl_config.yaml | 2 +- src/dagobert/modelling/rl/networks.py | 2 +- src/dagobert/modelling/rl/ppo.py | 150 ++++++++++++-------------- 3 files changed, 72 insertions(+), 82 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index cce7b3d7..170585df 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -4,7 +4,7 @@ # LIGHTNING # 
-------------------------------------------------------------------------------------- -gpus: 0 +gpus: 1 pin_memory: True profiler: True #val_check_interval: 0.5 diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 061e7366..40b81a9d 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -63,7 +63,7 @@ def forward(self, state, past_pw): else: s2 = torch.tanh(self.linear1(s1).squeeze(-1)) # bring together the state and past_pw representations make residual connection - return past_pw[:, 1:] + self.linear2(torch.cat([s2, a1], dim=1)) + return self.linear2(torch.cat([s2, a1], dim=1)) class ActorContinous(nn.Module): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 02fe9f70..8a8ee757 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -3,11 +3,13 @@ modified from https://github.com/sid-sundrani/ppo_lightning. """ # pylint: disable=no-member +import sys import logging from copy import deepcopy from pathlib import Path from typing import List, Tuple from argparse import Namespace +from itertools import chain import gym import torch @@ -36,6 +38,7 @@ logger = logging.getLogger(__name__) mp = torch.multiprocessing.get_context("spawn") +eps = np.finfo(float).eps def run_rl(args): @@ -71,6 +74,7 @@ def run_rl(args): # define trainer and and lightning module args.multiprocessing = True if args.gpus != 1 else False args.num_workers = 1 if args.num_workers == 0 else args.num_workers + args.windows = True if "win" in sys.platform else False trainer = Trainer.from_argparse_args( args, logger=tcn_loggers, @@ -113,11 +117,9 @@ def __init__(self, hparams: Namespace): self.critic = ActorCriticTCN( self.hparams, n_actions=n_actions, output_size=1, actor=False ) - self.critic.share_memory() self.actor = ActorContinous( ActorCriticTCN(self.hparams, n_actions=n_actions, output_size=n_actions) ) - 
self.actor.actor_net.share_memory() self.agent = ActorCriticAgent(self.actor, self.critic) self.buffer = ExperienceBuffer() self.avg_ep_reward = 0 @@ -154,21 +156,14 @@ def generate_experience_buffer( Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage. """ - # setup workers and pass them the env, agent, vars to work with max_worker_steps = int(self.hparams.steps_per_epoch / self.hparams.num_workers) parallel_experiences = ParallelExperiences() - self.agent.critic_net.cpu() - self.agent.critic_net.eval() - self.agent.actor_net.cpu() - self.agent.actor_net.eval() - from IPython import embed - - embed() + device = self.setup_model_for_experience_gathering() for i in range(self.hparams.num_workers): args = ( self.envs[i], self.agent, - "cpu", + device, max_worker_steps, self.hparams.max_episode_length, len(self.hparams.asset_names), @@ -177,15 +172,49 @@ def generate_experience_buffer( ) parallel_experiences.create_worker(*args) - # collect experiences in parallel, then merge them and create dataset + # collect experiences in parallel, then merge them self.buffer.merge_buffers(parallel_experiences.collect_experiences()) - self.buffer.yield_dataset() # this will yield a dataset for dataloader + # update metrics we log about the current performance of the agent + self.avg_ep_reward = self.buffer.epoch_rewards / self.buffer.done_episodes + eps + self.avg_reward = self.buffer.epoch_rewards / self.hparams.steps_per_epoch + self.avg_ep_len = self.hparams.steps_per_epoch / self.buffer.done_episodes + eps + + # yield a dataset for dataloader for updating actor/critic + self.setup_model_for_training() + for state, past_pw, action, logp_old, qval, adv in zip( + self.buffer.states, + self.buffer.past_pws, + self.buffer.actions, + self.buffer.logps, + self.buffer.qvals, + self.buffer.advs, + ): + yield state, past_pw, action, logp_old, qval, adv self.buffer.clear_buffer() - # finally update metrics we log - self.avg_ep_reward = 
self.buffer.epoch_rewards / self.buffer.done_episodes - self.avg_reward = self.buffer.epoch_rewards / self.hparams.steps_per_epoch - self.avg_ep_len = self.hparams.steps_per_epoch / self.buffer.done_episodes + def setup_model_for_experience_gathering(self): + """Helper function to move model to CPU if necessary""" + # dropout and batch-norm doesn't make sense for experience gathering + self.agent.critic_net.eval() + self.agent.actor_net.eval() + # we cannot use cuda tensor sharing on windows (necessary for multiprocessing) + if self.hparams.windows: + device = "cpu" + self.agent.critic_net.cpu() + self.agent.actor_net.cpu() + else: + device = self.device + self.agent.critic_net.share_memory() + self.agent.actor_net.share_memory() + return device + + def setup_model_for_training(self): + """Helper function to move model back to GPU if necessary""" + if self.hparams.windows and self.hparams.gpus != 0: + self.agent.critic_net.cuda() + self.agent.actor_net.cuda() + self.agent.critic_net.train() + self.agent.actor_net.train() @staticmethod def discount_rewards(rewards: List[float], discount: float) -> List[float]: @@ -237,17 +266,6 @@ def calc_advantage( adv = PPO.discount_rewards(delta, gamma * lam) return adv - @staticmethod - def normalise_advantage(batch_adv: List[float]) -> List[float]: - """ - Normalise across all episodes within the epoch. Apparently this helps with - covergence. 
- """ - # normalise advantage - adv = np.array(batch_adv) - adv = (adv - adv.mean()) / (adv.std() + np.finfo(float).eps) - return list(adv) - @staticmethod def _init_past_pw(asset_num, device) -> torch.Tensor: """ @@ -322,7 +340,9 @@ def training_step( loss """ state, past_pw, action, old_logp, qval, adv = batch - adv = PPO.normalise_advantage(adv) + # normalize advantages within batch + adv = (adv - adv.mean()) / adv.std() + self.log("avg_ep_len", self.avg_ep_len, on_step=False, on_epoch=True) self.log("avg_ep_reward", self.avg_ep_reward, on_step=False, on_epoch=True) self.log("avg_reward", self.avg_reward, on_step=False, on_epoch=True) @@ -375,20 +395,7 @@ class ExperienceBuffer: def __init__(self): """Class constructor""" - # step vars - self.states = [] - self.past_pws = [] - self.actions = [] - self.advs = [] - self.qvals = [] - self.logps = [] - self.infos = [] - - # episode vars - self.ep_rewards = [] - self.ep_values = [] - self.done_episodes = 0 - self.epoch_rewards = 0 + self.clear_buffer() def append( self, @@ -430,46 +437,29 @@ def merge_buffers(self, buffers): buffers: List of smaller ExpereinceBuffers to merge together from parallel processes. """ - pass - - def yield_dataset( - self, - ) -> Tuple[ - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - List[torch.Tensor], - ]: - """ - Yields an iterable dataset for Pytorch Lightning from the contents of the - ExperienceBuffer. - - Yield: - Tuple of Lists containing tensors for states, actions, log probs, qvals and - advantage. 
- """ - data = zip( - self.states, - self.past_pws, - self.actions, - self.logps, - self.qvals, - self.advs, - ) - for state, past_pw, action, logp_old, qval, adv in data: - yield state, past_pw, action, logp_old, qval, adv + self.states = chain(*[buffer.states for buffer in buffers]) + self.past_pws = chain(*[buffer.past_pws for buffer in buffers]) + self.actions = chain(*[buffer.actions for buffer in buffers]) + self.logps = chain(*[buffer.logps for buffer in buffers]) + self.qvals = chain(*[buffer.qvals for buffer in buffers]) + self.advs = chain(*[buffer.advs for buffer in buffers]) + self.done_episodes = sum([buffer.done_episodes for buffer in buffers]) + self.epoch_rewards = sum([buffer.epoch_rewards for buffer in buffers]) def clear_buffer(self): """Resets the ExperienceBuffer.""" - self.states.clear() - self.past_pws.clear() - self.actions.clear() - self.advs.clear() - self.logps.clear() - self.qvals.clear() - self.ep_rewards.clear() - self.ep_values.clear() + # step vars + self.states = [] + self.past_pws = [] + self.actions = [] + self.advs = [] + self.qvals = [] + self.logps = [] + self.infos = [] + + # episode / epoch vars + self.ep_rewards = [] + self.ep_values = [] self.done_episodes = 0 self.epoch_rewards = 0 From f78c606d1a27db65a534ec1d1842dcb8a2700de7 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sun, 17 Jan 2021 08:49:45 +0000 Subject: [PATCH 34/62] now it works reliably both on win/linux! 
hallefuckingluja --- config/rl_config.yaml | 21 +++++++++++---------- src/dagobert/modelling/rl/ppo.py | 5 ++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 170585df..af4a8ce6 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -23,7 +23,7 @@ tags: - RL_test no_comet_logger: True seed: 42 -batch_size: 256 +batch_size: 512 # -------------------------------------------------------------------------------------- # RL @@ -35,15 +35,16 @@ asset_names: trading_cost: 0.002 reward_type: return max_episode_length: 500 -steps_per_epoch: 5000 -n_optim_iters: 4 -gamma: 0.99 -lam: 0.95 +steps_per_epoch: 10000 +n_optim_iters: 8 +gamma: 0.95 +lam: 0.9 lr_actor: 0.001 lr_critic: 0.001 -clip_ratio: 0.25 +clip_ratio: 0.2 target_kl: 0.01 + # don't change these, or preprocessing won't work target_col: rl_return to_label: False @@ -55,11 +56,11 @@ no_weight_norm: True # MODEL # -------------------------------------------------------------------------------------- -actor_num_channels: [50, 50, 50, 50, 50] -actor_kernel_size: 5 +actor_num_channels: [100, 100, 100, 100, 100] +actor_kernel_size: 3 actor_dropout: 0.25 -critic_num_channels: [50, 50, 50, 50, 50] -critic_kernel_size: 5 +critic_num_channels: [100, 100, 100, 100, 100] +critic_kernel_size: 3 critic_dropout: 0.25 use_last_timepoint: True diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 8a8ee757..411eb99c 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -37,6 +37,7 @@ logger = logging.getLogger(__name__) +torch.multiprocessing.set_sharing_strategy("file_system") mp = torch.multiprocessing.get_context("spawn") eps = np.finfo(float).eps @@ -471,7 +472,7 @@ class ParallelExperiences: def __init__(self): """Class constructor.""" - self.queue = mp.Queue() + self.queue = mp.JoinableQueue() self.processes = [] def collect_experiences(self) -> List[ExperienceBuffer]: @@ -480,6 
+481,7 @@ def collect_experiences(self) -> List[ExperienceBuffer]: # gather results from workers using the queue and merge them into one for process in self.processes: buffers.append(self.queue.get()) # will block + self.queue.task_done() for process in self.processes: process.join() return buffers @@ -497,6 +499,7 @@ def _wrapper(func, queue, args, kwargs): buffer = func(*args, **kwargs) # add collected experience to the queue so it can be returned to master process queue.put(buffer) + queue.join() def gather_experience( From 5785c57f67af5e922f316f644cb6f09cd8fcf491 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Wed, 20 Jan 2021 08:55:59 +0000 Subject: [PATCH 35/62] adding logging of weights/portfolio and making few fixes to how actions are sampled --- config/rl_config.yaml | 100 ++++++++++++--------- src/dagobert/modelling/dl/data.py | 20 +++-- src/dagobert/modelling/rl/environment.py | 11 +-- src/dagobert/modelling/rl/networks.py | 34 +++++-- src/dagobert/modelling/rl/ppo.py | 109 +++++++++++++++++------ src/dagobert/modelling/rl/rl_args.py | 12 +++ 6 files changed, 193 insertions(+), 93 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index af4a8ce6..7576287c 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -23,7 +23,7 @@ tags: - RL_test no_comet_logger: True seed: 42 -batch_size: 512 +batch_size: 240 # -------------------------------------------------------------------------------------- # RL @@ -32,16 +32,18 @@ batch_size: 512 asset_names: - BTC - ETH + - XRP + - LTC trading_cost: 0.002 reward_type: return -max_episode_length: 500 -steps_per_epoch: 10000 -n_optim_iters: 8 -gamma: 0.95 -lam: 0.9 -lr_actor: 0.001 -lr_critic: 0.001 -clip_ratio: 0.2 +max_episode_length: 240 +steps_per_epoch: 24000 +n_optim_iters: 2 +gamma: 0.0 +lam: 0.90 +lr_actor: 0.0003 +lr_critic: 0.0003 +clip_ratio: 0.25 target_kl: 0.01 @@ -56,13 +58,15 @@ no_weight_norm: True # MODEL # 
-------------------------------------------------------------------------------------- -actor_num_channels: [100, 100, 100, 100, 100] +actor_num_channels: [50, 50, 50, 50, 50] actor_kernel_size: 3 -actor_dropout: 0.25 -critic_num_channels: [100, 100, 100, 100, 100] +actor_dropout: 0.2 +# sample size - exp abs diff to mean | 20 - 5% | 50 - 3% | 100 - 2% | 500 - 1% +actor_dirichlet_sample_size: 20 +critic_num_channels: [50, 50, 50, 50, 50] critic_kernel_size: 3 -critic_dropout: 0.25 -use_last_timepoint: True +critic_dropout: 0.2 +use_last_timepoint: False # -------------------------------------------------------------------------------------- # DATA @@ -75,8 +79,14 @@ lookback: auto mini_series_length: auto df_train: - anchor: std_bar_BTCUSDT_tick_1.feather - df2: std_bar_ETHUSDT_tick_1.feather + anchor: std_bar_BTCUSDT_volume_100.feather + df2: std_bar_ETHUSDT_volume_500.feather + df3: std_bar_XRPUSDT_volume_125000.feather + df4: std_bar_LTCUSDT_volume_1000.feather + # anchor: std_bar_BTCUSDT_tick_1.feather + # df2: std_bar_ETHUSDT_tick_1.feather + # df3: std_bar_XRPUSDT_tick_1.feather + # df4: std_bar_LTCUSDT_tick_1.feather df_val: df_test: cols_to_model: @@ -86,14 +96,14 @@ cols_to_model: - high - low - close - - open_fd_0.0 - - high_fd_0.0 - - low_fd_0.0 - - close_fd_0.0 - - open_fd_tuned - - high_fd_tuned - - low_fd_tuned - - close_fd_tuned + # - open_fd_0.0 + # - high_fd_0.0 + # - low_fd_0.0 + # - close_fd_0.0 + # - open_fd_tuned + # - high_fd_tuned + # - low_fd_tuned + # - close_fd_tuned - cum_ticks - cum_dollar - volume @@ -106,24 +116,26 @@ cols_to_model: - cos_date - sin_time - cos_time - - boll - - boll_lb - - boll_ub - - macd - - macds - - macdh - - wr_60 - - rsi_60 - - rsv_60 - - atr_60 - - cci_60 - - kdjk_60 - - kdjd_60 - - kdjj_60 - - pdi_60 - - mdi_60 - - vr_60 + # - boll + # - boll_lb + # - boll_ub + # - macd + # - macds + # - macdh + # - wr_60 + # - rsi_60 + # - rsv_60 + # - atr_60 + # - cci_60 + # - kdjk_60 + # - kdjd_60 + # - kdjj_60 + # - pdi_60 
+ # - mdi_60 + # - vr_60 df2: + df3: + df4: # the cols of the secondary DFs will automatically be set to anchor's if not defined time_feat_n: 1 @@ -138,9 +150,9 @@ augment_dfs_mix: 0 # PREPROCESSING # -------------------------------------------------------------------------------------- -train_start_date: "2018-06-01" -train_days: 730 -val_days: 60 +train_start_date: "2019-01-01" +train_days: 500 +val_days: 30 val_train_offset_days: 1 val_puffer_days: 1 test_days: 30 diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 969a2a56..dc9c77df 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -6,7 +6,6 @@ from pathlib import Path from argparse import Namespace from typing import List, Tuple, Union, Iterable, Callable - import torch import numpy as np import pandas as pd @@ -571,15 +570,20 @@ class PortfolioCryptoDataset(CryptoDataset): is achieved by adding the rl_return target column to the cols_to_model at init, and then fishing it out for each sample before returning it. - This convulated way was used so we can repurpose and keep as much of the original + This convoluted way was used so we can repurposed and keep as much of the original CryptoDataset as possible, without extensive refactoring. 
""" - def __init__(self, *args, **kw): + def __init__(self, *args, **kwargs): # for each instrument, we add the rl_return target col to their cols_to_model - for df_name, _ in kw[npa.cols_to_model].items(): - kw[npa.cols_to_model][df_name].append(NRL.rl_return) - super().__init__(*args, **kw) + for df_name, _ in kwargs[npa.cols_to_model].items(): + kwargs[npa.cols_to_model][df_name].append(NRL.rl_return) + # lazy way to check if we have datediff as first feature, if so, cumsum it + if kwargs[npa.cols_to_model][df_name][0] == "date_diff": + self.sum_date_diffs = True + else: + self.sum_date_diffs = False + super().__init__(*args, **kwargs) def __getitem__(self, idx): """ @@ -596,6 +600,10 @@ def __getitem__(self, idx): for i, X in enumerate(Xs): ys[i] = X[-1, -1] Xs[i] = X[:-1, :] + if self.sum_date_diffs: + # make the cumulative flow from right (present) to left (past) + cs = Xs[i][0][::-1].cumsum()[::-1] + Xs[i][0] = MinMaxScaler().fit_transform(cs.reshape([-1, 1])).ravel() return Xs, ys diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 11efe152..760d9639 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -137,6 +137,9 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: # (eq16) cost to change portfolio: # excluding change in cash to avoid double counting for transaction cost + # TODO: isn't this unrealistic? 
this would only work if there's an efficient + # market between all pairs within the portfilio, i.e XRP -> LTC, althought + # this is what pgportfolio uses too mu = self.trading_cost * (np.abs(dw1[1:] - w1[1:])).sum() # (eq11) final portfolio value: I thought this should be w1 (at the end), but @@ -167,7 +170,6 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: "portfolio_value": p1, "market_return": y1.mean(), "rate_of_return": rho1, - "weights_mean": w1.mean(), "weights_std": w1.std(), "rebalancing_cost": mu, } @@ -245,13 +247,8 @@ def step(self, action: np.array): action: Portfolio weights for the N assets and the cash (first item). They should all be between 0 and 1 (no shorting) and sum to 1. """ - # cut and normalise action (just in case) - action = np.clip(action, 0, 1) - weights = action - weights /= weights.sum() + eps - next_state, y1 = self.data.step() - reward, info, done = self.portfolio.step(weights, y1) + reward, info, done = self.portfolio.step(action, y1) self.infos.append(info) return next_state, reward, done, info diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 40b81a9d..2a0365b5 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -2,6 +2,7 @@ from argparse import Namespace from typing import Union, Tuple +import numpy as np import torch from torch import nn from torch.distributions import Dirichlet @@ -36,6 +37,7 @@ def __init__( super().__init__() self.hparams = hparams self.n_actions = n_actions + self.actor = actor num_inputs = [len(cols) for dataset, cols in hparams.cols_to_model.items()] num_channels = ( hparams.actor_num_channels if actor else hparams.critic_num_channels @@ -62,17 +64,17 @@ def forward(self, state, past_pw): s2 = s1[:, :, -1] else: s2 = torch.tanh(self.linear1(s1).squeeze(-1)) - # bring together the state and past_pw representations make residual connection + # bring together the state and past_pw 
representations return self.linear2(torch.cat([s2, a1], dim=1)) -class ActorContinous(nn.Module): +class ActorContinuous(nn.Module): """ - Policy network, for continous action spaces, which returns a distribution + Policy network, for continuous action spaces, which returns a distribution and an action given an observation """ - def __init__(self, actor_net): + def __init__(self, actor_net: TemporalConvNet, sample_size: int = 1): """ The original PPO can be used for discrete action spaces with a Categorical distribution or for a continuous actions space with a multivariate Gaussian, @@ -91,23 +93,41 @@ def __init__(self, actor_net): and sampling our actions from that. This by design returns a probability summing to one and there's no need to learn a separate std param. + We can make this more deterministic with the sample_size param, see docs. + NOTE! I'm not sure how well this works or how legit it is, as I haven't found any papers or implementations actually doing this. Args: actor_net: Initialized actor net. + sample_size: Determines how deterministic our Dirichlet based sampling is. + At default (1), we return a single sample from the dist. With higher + sample sizes the returned weights are closer and closer to the actual + mean of the distribution. If it's set to zero, we return the mean and + basically the model becomes deterministic. 
""" super().__init__() self.actor_net = actor_net self.inv_lin = InverseLinear() + self.sample_size = sample_size def forward(self, states, past_pw): # get params for Dirichlet, and drop batch dim if batch_size=1 logits = self.actor_net(states, past_pw) concentrations = self.inv_lin(logits).squeeze(0) pi = Dirichlet(concentrations) - actions = pi.sample() + + # take 20 samples - corresponds to +/- 5% compared to returning the mean + if self.sample_size > 0: + actions = pi.sample((self.sample_size,)).mean(dim=0) + else: + actions = pi.sample.mean + + # very rarely we get actions that don't sum to 1 or are negative, fix it here + if actions.sum() != past_pw.shape[0]: + actions = torch.clip(actions, 0.01, 0.99) + actions = (actions.T / actions.sum(dim=1)).T return pi, actions, logits def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): @@ -119,7 +139,7 @@ def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): pi: torch distribution actions: actions taken by distribution Returns: - log probability of the acition under pi + log probability of the action under pi """ return pi.log_prob(actions) @@ -170,7 +190,7 @@ def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor) -> torch.Tensor: pi: torch distribution actions: actions taken by distribution Returns: - log probability of the acition under pi + log probability of the action under pi """ return self.actor_net.get_log_prob(pi, actions) diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 411eb99c..ff488143 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -14,6 +14,7 @@ import gym import torch import numpy as np +import pandas as pd import torch.optim as optim from torch.utils.data import DataLoader from pytorch_lightning import LightningModule @@ -26,7 +27,7 @@ from dagobert.modelling.rl import ( RLEnv, ActorCriticTCN, - ActorContinous, + ActorContinuous, ActorCriticAgent, ) from dagobert.modelling.dl import ( @@ -65,7 +66,7 
@@ def run_rl(args): # setup callbacks checkpoint_callback = ModelCheckpoint( - monitor="avg_reward", + monitor="avg_total_reward", filename="_{epoch:02d}_{avg_reward:.10f}", dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", save_top_k=3, @@ -118,14 +119,13 @@ def __init__(self, hparams: Namespace): self.critic = ActorCriticTCN( self.hparams, n_actions=n_actions, output_size=1, actor=False ) - self.actor = ActorContinous( - ActorCriticTCN(self.hparams, n_actions=n_actions, output_size=n_actions) + self.actor = ActorContinuous( + ActorCriticTCN(self.hparams, n_actions=n_actions, output_size=n_actions), + self.hparams.actor_dirichlet_sample_size, ) self.agent = ActorCriticAgent(self.actor, self.critic) self.buffer = ExperienceBuffer() - self.avg_ep_reward = 0 - self.avg_ep_len = 0 - self.avg_reward = 0 + self.to_log = {} # ---------------------------------------------------------------------------------- # EXPERIENCE COLLECTION FOR TRAIN DATALOADER @@ -173,16 +173,13 @@ def generate_experience_buffer( ) parallel_experiences.create_worker(*args) - # collect experiences in parallel, then merge them + # collect experiences in parallel, then merge them, calculate metrics self.buffer.merge_buffers(parallel_experiences.collect_experiences()) - # update metrics we log about the current performance of the agent - self.avg_ep_reward = self.buffer.epoch_rewards / self.buffer.done_episodes + eps - self.avg_reward = self.buffer.epoch_rewards / self.hparams.steps_per_epoch - self.avg_ep_len = self.hparams.steps_per_epoch / self.buffer.done_episodes + eps + self.update_metrics_to_log() - # yield a dataset for dataloader for updating actor/critic + # yield a dataset for dataloader for updating actor/critic and clear buffer self.setup_model_for_training() - for state, past_pw, action, logp_old, qval, adv in zip( + for state, past_pw, action, old_logp, qval, adv in zip( self.buffer.states, self.buffer.past_pws, self.buffer.actions, @@ -190,7 +187,7 @@ def 
generate_experience_buffer( self.buffer.qvals, self.buffer.advs, ): - yield state, past_pw, action, logp_old, qval, adv + yield state, past_pw, action, old_logp, qval, adv self.buffer.clear_buffer() def setup_model_for_experience_gathering(self): @@ -217,8 +214,33 @@ def setup_model_for_training(self): self.agent.critic_net.train() self.agent.actor_net.train() + def update_metrics_to_log(self): + """Helper function recalculating metrics we track at end of each epoch""" + done_eps = self.buffer.done_episodes + eps + ep_rewards = self.buffer.epoch_rewards + e = "episode" + p = "portfolio" + + # pytorch lightning model checkpoint needs metric name without / + self.to_log["avg_total_reward"] = ep_rewards / done_eps + self.to_log[f"{e}/avg_total_reward"] = ep_rewards / done_eps + self.to_log[f"{e}/avg_step_reward"] = ep_rewards / self.hparams.steps_per_epoch + self.to_log[f"{e}/avg_len"] = self.hparams.steps_per_epoch / done_eps + self.to_log[f"{p}/avg_value_epend"] = ( + sum(list(self.buffer.p_ep_end_value)) / done_eps + ) + + # need this otherwise the generator won't work multiple times + infos = pd.DataFrame(list(self.buffer.infos)).mean() + self.to_log[f"{p}/avg_value"] = infos["portfolio_value"] + self.to_log[f"{p}/avg_weight_std"] = infos["weights_std"] + self.to_log[f"{p}/avg_rebalancing_cost"] = infos["rebalancing_cost"] + self.to_log[f"{p}/avg_market_return"] = infos["market_return"] + for w in infos.index[infos.index.str.contains("weight_")]: + self.to_log[f"weights/{w}"] = infos[w] + @staticmethod - def discount_rewards(rewards: List[float], discount: float) -> List[float]: + def discount_rewards(rewards: List[float], gamma: float = 0.99) -> List[float]: """ Calculate the discounted rewards of all rewards in list. 
This is used as Q-values for training the critic network so it becomes better approximating @@ -226,6 +248,7 @@ def discount_rewards(rewards: List[float], discount: float) -> List[float]: Args: rewards: list of rewards/advantages + gamma: Gamma for discounting the long-term rewards. Returns: list of discounted rewards/advantages @@ -234,7 +257,7 @@ def discount_rewards(rewards: List[float], discount: float) -> List[float]: cumul_reward = [] sum_r = 0.0 for r in reversed(rewards): - sum_r = (sum_r * discount) + r + sum_r = (sum_r * gamma) + r cumul_reward.append(sum_r) return list(reversed(cumul_reward)) @@ -292,10 +315,10 @@ def _update_past_pw(p1: float, action: torch.Tensor, device) -> torch.Tensor: # LOSSES AND OPTIMIZERS # ---------------------------------------------------------------------------------- - def actor_loss(self, state, past_pw, action, logp_old, adv) -> torch.Tensor: + def actor_loss(self, state, past_pw, action, old_logp, adv) -> torch.Tensor: pi, _, _ = self.actor(state, past_pw) logp = self.actor.get_log_prob(pi, action) - old_new_diff = logp - logp_old + old_new_diff = logp - old_logp ratio = torch.exp(old_new_diff) # idea taken from spinningup PPO implemenetation to prevent exploding loss approx_kl = old_new_diff.mean().item() @@ -344,20 +367,25 @@ def training_step( # normalize advantages within batch adv = (adv - adv.mean()) / adv.std() - self.log("avg_ep_len", self.avg_ep_len, on_step=False, on_epoch=True) - self.log("avg_ep_reward", self.avg_ep_reward, on_step=False, on_epoch=True) - self.log("avg_reward", self.avg_reward, on_step=False, on_epoch=True) + # log all metrics (other than loss) + for k, v in self.to_log.items(): + self.log(k, v, on_step=False, on_epoch=True) if optimizer_idx == 0: + loss_actor, approx_kl = self.actor_loss( state, past_pw, action, old_logp, adv ) - self.log("loss_actor", loss_actor, on_epoch=True, on_step=False) - self.log("approx_kl", approx_kl, on_epoch=True, on_step=False) + if torch.isnan(loss_actor): + 
from IPython import embed + + embed() + self.log("loss/actor", loss_actor, on_epoch=True, on_step=False) + self.log("loss/approx_kl", approx_kl, on_epoch=True, on_step=False) return loss_actor elif optimizer_idx == 1: loss_critic = self.critic_loss(state, past_pw, qval) - self.log("loss_critic", loss_critic, on_epoch=True, on_step=False) + self.log("loss/critic", loss_critic, on_epoch=True, on_step=False) return loss_critic @staticmethod @@ -430,6 +458,24 @@ def append( self.ep_rewards.append(reward) self.ep_values.append(value.item()) + def shift_rewards(self): + """ + The reward at time t was is realised as a consequence of action t-1. This is + special to our environment (see last paragraph of page 9 in the article: + https://arxiv.org/pdf/1706.10059.pdf). This means, at the end of each episode + we need to drop the very last element of state/action/logp/value/info and + shift the rewards by one to the right, i.e. making r0 align with a1, r1 with a2, + ... rn-1 with an. + """ + self.ep_rewards = self.ep_rewards[1:] + self.states.pop(-1) + self.past_pws.pop(-1) + self.actions.pop(-1) + self.logps.pop(-1) + self.infos.pop(-1) + self.ep_rewards.pop(-1) + self.ep_values.pop(-1) + def merge_buffers(self, buffers): """ Merges the passed in ExperienceBuffers and overwrites the current state with it. 
@@ -444,6 +490,8 @@ def merge_buffers(self, buffers): self.logps = chain(*[buffer.logps for buffer in buffers]) self.qvals = chain(*[buffer.qvals for buffer in buffers]) self.advs = chain(*[buffer.advs for buffer in buffers]) + self.infos = chain(*[buffer.infos for buffer in buffers]) + self.p_ep_end_value = chain(*[buffer.p_ep_end_value for buffer in buffers]) self.done_episodes = sum([buffer.done_episodes for buffer in buffers]) self.epoch_rewards = sum([buffer.epoch_rewards for buffer in buffers]) @@ -457,6 +505,7 @@ def clear_buffer(self): self.qvals = [] self.logps = [] self.infos = [] + self.p_ep_end_value = [] # episode / epoch vars self.ep_rewards = [] @@ -530,7 +579,7 @@ def gather_experience( lam: See docs of :func:`PPO.calc_advantage` Returns: - Adds the results to `exp_queue` so it can be processed in the main process. + Experience collected in this parallel worker. """ buffer = ExperienceBuffer() state = env.reset() @@ -547,17 +596,19 @@ def gather_experience( terminal = len(buffer.ep_rewards) == max_episode_length if done or terminal: - buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma)[:-1] + # this is specific to our special environment setup + # buffer.shift_rewards() + buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma) buffer.advs += PPO.calc_advantage( buffer.ep_rewards, buffer.ep_values, gamma, lam ) buffer.done_episodes += 1 buffer.epoch_rewards += np.sum(buffer.ep_rewards) + buffer.p_ep_end_value.append(info["portfolio_value"]) - # episode over, reset the env and the buffer + # episode over, reset the env and the episode buffer buffer.ep_rewards = [] buffer.ep_values = [] state = env.reset() past_pw = PPO._init_past_pw(asset_num, device) - return buffer diff --git a/src/dagobert/modelling/rl/rl_args.py b/src/dagobert/modelling/rl/rl_args.py index 32853ad6..4f2db348 100644 --- a/src/dagobert/modelling/rl/rl_args.py +++ b/src/dagobert/modelling/rl/rl_args.py @@ -110,6 +110,18 @@ def add_rl_specific_args(parent_parser): 
default=0.2, help="Clipping parameter for the PPO's policy upgrade cost function.", ) + parser.add_argument( + "--actor_dirichlet_sample_size", + type=float, + default=1, + help=( + "Determines how deterministic our Dirichlet based sampling is. At default " + "(1), we return a single sample from the dist. With higher sample sizes " + "the returned weights are closer and closer to the actual mean of the " + "distribution. If it's set to zero, we return the mean and basically " + "the model becomes deterministic." + ), + ) return parser From 24b28376f834481657667b55a20a3b3fb1eea2dd Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Thu, 21 Jan 2021 18:52:03 +0000 Subject: [PATCH 36/62] data, runner, cleanup --- config/tcn_config_data.yaml | 3 +- config/timegan_config.yaml | 35 +- notebooks/modelling/test_cryptodataset.ipynb | 341 +++++++++++++++++- .../modelling/augmentation/timegan.py | 175 +++++++-- src/dagobert/modelling/dl/__init__.py | 8 +- src/dagobert/modelling/dl/data.py | 39 +- 6 files changed, 550 insertions(+), 51 deletions(-) diff --git a/config/tcn_config_data.yaml b/config/tcn_config_data.yaml index da8eca4b..ea9a5926 100644 --- a/config/tcn_config_data.yaml +++ b/config/tcn_config_data.yaml @@ -59,7 +59,8 @@ no_sample_weights: False # DATA # -------------------------------------------------------------------------------------- -data_dir: "/home/ubuntu/dagobert/data/modelling" +#data_dir: "/home/ubuntu/dagobert/data/modelling" +data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" lookback: auto mini_series_length: auto diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index cd65fefd..9c664c4d 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -12,7 +12,7 @@ gpus: 0 log_dir: logs num_workers: 4 -exp_name: Time-GAN +exp_name: TGAN-test tags: - time_gan_test no_comet_logger: True @@ -29,6 +29,12 @@ rnn: lstm # embedding weight in cost of generator loss emb_weight: 1 +# don't change these, or preprocessing 
won't work +target_col: +to_label: False +no_sample_weights: True +binariser_method: + # -------------------------------------------------------------------------------------- # MODEL # -------------------------------------------------------------------------------------- @@ -37,7 +43,7 @@ dropout: 0.2 num_layers: 2 hidden_size: 50 z_dim: 50 -mini_series_length: 240 +mini_series_length: 50 # -------------------------------------------------------------------------------------- # DATA @@ -47,6 +53,31 @@ mini_series_length: 240 #data_dir: "/home/daniel/dagobert_data/modelling" data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" + +df_train: + anchor: std_bar_ETHUSDT_tick_1.feather + +cols_to_model: + anchor: + - date_diff + - open + - high + - low + - close +# - cum_ticks +# - cum_dollar +# - volume +# - cum_volume_buy +# - cum_volume_sell +# - cum_volume_quote +# - cum_volume_quote_buy +# - cum_volume_quote_sell +# - sin_date +# - cos_date +# - sin_time +# - cos_time + + # -------------------------------------------------------------------------------------- # PREPROCESSING # -------------------------------------------------------------------------------------- diff --git a/notebooks/modelling/test_cryptodataset.ipynb b/notebooks/modelling/test_cryptodataset.ipynb index c6d23254..3cf26b90 100644 --- a/notebooks/modelling/test_cryptodataset.ipynb +++ b/notebooks/modelling/test_cryptodataset.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -12,8 +21,9 @@ "import pandas as pd\n", "import numpy as np\n", "import torch\n", + "from pathlib import Path\n", "\n", - "from dagobert.modelling.dl import CryptoDataset\n", + "from dagobert.modelling.dl import CryptoDataset, GeneratorCryptoDataset\n", "from dagobert.preprocessing.utils import set_dt_index" ] }, @@ -27,7 +37,8 @@ "output_type": "stream", "text": [ " dev\n", - "* test/cryptodata\n" + " feat/orderbook_data\n", + "* feat/tgan\n" ] } ], @@ -39,11 +50,229 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# conda environments:\n", + "#\n", + "base C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\n", + "dagobert * C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\n", + "tensorenviron C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\tensorenviron\n", + "\n" + ] + } + ], + "source": [ + "! conda env list" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ "df = pd.read_feather(\"C:/Users/u164428/Desktop/Dagobert/data/modelling/std_bar_XRPUSDT_volume_125000.feather\")" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
date_diffsin_timecos_timesin_datecos_datedate_timeopenclosehighlowcum_tickscum_dollarvolumecum_volume_buycum_volume_sellcum_volume_quotecum_volume_quote_buycum_volume_quote_sell
00.0-0.9469300.3214390.587785-0.8090172018-05-26 19:15:000.620490.620730.622300.61990118.082411.086333132750.4565058.6067691.8582434.10089940405.12935942028.971540
1840.0-0.9255410.3786490.587785-0.8090172018-05-26 19:29:000.620740.621240.621810.62000119.081044.155573130494.8188327.5442167.2781042.37997554868.96517926173.414796
2420.0-0.9135450.4067370.587785-0.8090172018-05-26 19:36:000.620690.621510.621570.6200171.079926.853613128650.06126545.462104.6079913.08364278606.0554881307.028154
3420.0-0.9006980.4344450.587785-0.8090172018-05-26 19:43:000.621510.622250.622250.6201097.084404.854894135782.73129644.416138.3284370.10654080558.6915123811.415028
4840.0-0.8724960.4886210.587785-0.8090172018-05-26 19:57:000.621080.620560.622240.61925118.091415.801332147278.7893035.0254243.7691370.89939557733.40753533637.491861
\n", + "
" + ], + "text/plain": [ + " date_diff sin_time cos_time sin_date cos_date date_time \\\n", + "0 0.0 -0.946930 0.321439 0.587785 -0.809017 2018-05-26 19:15:00 \n", + "1 840.0 -0.925541 0.378649 0.587785 -0.809017 2018-05-26 19:29:00 \n", + "2 420.0 -0.913545 0.406737 0.587785 -0.809017 2018-05-26 19:36:00 \n", + "3 420.0 -0.900698 0.434445 0.587785 -0.809017 2018-05-26 19:43:00 \n", + "4 840.0 -0.872496 0.488621 0.587785 -0.809017 2018-05-26 19:57:00 \n", + "\n", + " open close high low cum_ticks cum_dollar volume \\\n", + "0 0.62049 0.62073 0.62230 0.61990 118.0 82411.086333 132750.45 \n", + "1 0.62074 0.62124 0.62181 0.62000 119.0 81044.155573 130494.81 \n", + "2 0.62069 0.62151 0.62157 0.62001 71.0 79926.853613 128650.06 \n", + "3 0.62151 0.62225 0.62225 0.62010 97.0 84404.854894 135782.73 \n", + "4 0.62108 0.62056 0.62224 0.61925 118.0 91415.801332 147278.78 \n", + "\n", + " cum_volume_buy cum_volume_sell cum_volume_quote cum_volume_quote_buy \\\n", + "0 65058.60 67691.85 82434.100899 40405.129359 \n", + "1 88327.54 42167.27 81042.379975 54868.965179 \n", + "2 126545.46 2104.60 79913.083642 78606.055488 \n", + "3 129644.41 6138.32 84370.106540 80558.691512 \n", + "4 93035.02 54243.76 91370.899395 57733.407535 \n", + "\n", + " cum_volume_quote_sell \n", + "0 42028.971540 \n", + "1 26173.414796 \n", + "2 1307.028154 \n", + "3 3811.415028 \n", + "4 33637.491861 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.iloc[:, :18]\n", + "df.head()" + ] + }, { "cell_type": "code", "execution_count": 73, @@ -1389,6 +1618,110 @@ "source": [ "x.searchsorted(5, side=\"left\"), x.searchsorted(5, side=\"right\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TimeGAN" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "BATCH_SIZE = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "std_bar_XRPUSDT_tick_1.feather doesn't have enough bars to use with the anchor after restricting it to the max date of the anchor's index:std_bar_XRPUSDT_tick_1.feather: 1279233 bars / anchor: 1624303 bars.\n", + "The samples from this augment_dfs will be less unique as we approachthe end date of the anchor 2020-10-17T00:06:00.000000000.\n", + "std_bar_XRPUSDT_tick_1.feather doesn't have adequate time-coverage for anchor DF. This could lead to non-unique samples from this augment_dfs.\n", + "\n", + "Anchor min/max dates: 2017-08-19T00:39:00.000000000/2020-10-17T00:06:00.000000000. \n", + "std_bar_XRPUSDT_tick_1.feather min/max dates: 2018-05-06 00:09:00/2020-10-17 00:06:00.\n" + ] + } + ], + "source": [ + "dataset = GeneratorCryptoDataset(\n", + " df_to_load = {\"anchor\": \"std_bar_ETHUSDT_tick_1.feather\",\n", + " \"df2\": \"std_bar_XRPUSDT_tick_1.feather\"},\n", + " cols_to_model = {\"anchor\": [\"date_diff\", \"open\", \"high\", \"low\", \"close\"],\n", + " \"df2\": [\"date_diff\", \"open\"]},\n", + " target_col = None,\n", + " mini_series_length = 3,\n", + " last_y = True,\n", + " date_col = \"date_time\",\n", + " data_dir = Path(\"C:/Users/u164428/Desktop/Dagobert/data/modelling\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "data_loaded = list(torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "541435" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data_loaded)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([3, 3, 7])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" 
+ } + ], + "source": [ + "data_loaded[0].shape " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 44268be1..cf621d5c 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -7,6 +7,7 @@ from typing import List, Optional from argparse import Namespace import logging +from copy import deepcopy from pathlib import Path import numpy as np @@ -23,11 +24,67 @@ from torch.utils.data import Dataset, WeightedRandomSampler, RandomSampler, DataLoader from pytorch_lightning import LightningModule - -from dagobert.modelling.dl import AdaBelief +from pytorch_lightning.trainer import seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning import Trainer, Callback, loggers + +from dagobert.naming import NStudy, NPreprocessingArgs as npa +from dagobert.modelling.dl import ( + GeneratorCryptoDataset, + TemporalConvNet, + Preprocessing, + AdaBelief, + LogCoshLoss, + FocalLoss, +) from dagobert.modelling.augmentation.utils import get_noise +logger = logging.getLogger(__name__) + + +def run_tgan(args): + # setup loggers + seed_everything(args.seed) + tb_logger_name = None + comet_name = args.exp_name + gan_loggers = [] + tb_logger = loggers.TensorBoardLogger( + save_dir=Path(args.log_dir), name=args.exp_name, version=tb_logger_name + ) + gan_loggers.append(tb_logger) + if not args.no_comet_logger: + gan_loggers.append( + loggers.CometLogger( + api_key=NStudy.comet_api_key, + workspace=NStudy.comet_workspace, + save_dir=args.log_dir, + project_name=NStudy.comet_project_name, + experiment_name=f"{comet_name}_{tb_logger.version}", + ) + ) + + # setup callbacks + checkpoint_callback = ModelCheckpoint( + monitor="loss_gen", + filename="_{epoch:02d}_{avg_reward:.10f}", + 
dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", + save_top_k=3, + mode="max", + ) + + # define trainer and and lightning module + args.multiprocessing = True if args.gpus != 1 else False + trainer = Trainer.from_argparse_args( + args, + logger=gan_loggers, + checkpoint_callback=checkpoint_callback, + ) + model = TimeGANLightning(args) + trainer.fit(model) + # trainer.test() + + class RnnBlock(nn.Module): """ Class for creating 5 components of TimeGAN. @@ -105,6 +162,7 @@ def __init__(self, hparams: Namespace): # define main vars (other than model) super().__init__() # TODO: pre sanity check, define hparams + hparams = TimeGANLightning._pre_sanity_check(hparams) # lightning sets this to cuda too late for some of our setup to work self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" # TODO: check if real data is the right one, get data in @@ -190,8 +248,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): Returns: Loss """ - # TODO: is there any label to give back? 
- x, label = batch + x = batch batch_len = len(x) h = self.embedder(x) @@ -256,35 +313,35 @@ def training_step(self, batch, batch_idx, optimizer_idx): ) # update generator if optimizer_idx == 2: - - e_hat = self.generator(z) - h_hat = self.supervisor(e_hat) - h_hat_supervise = self.supervisor(h) - - # synthetic data - x_hat = self.recovery(h_hat) - # no_grad to leave discriminator unchanged - with torch.no_grad(): - y_fake = self.discriminator(h_hat) - y_fake_e = self.discriminator(e_hat) - loss_gen = TimeGANLightning.generator_loss( - y_fake, - y_fake_e, - h, - h_hat_supervise, - x, - x_hat, - self.hparams.emb_weight, - ) - self.log( - "loss_gen", - loss_gen, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) - return loss_gen + for i in range(2): + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + h_hat_supervise = self.supervisor(h) + + # synthetic data + x_hat = self.recovery(h_hat) + # no_grad to leave discriminator unchanged + with torch.no_grad(): + y_fake = self.discriminator(h_hat) + y_fake_e = self.discriminator(e_hat) + loss_gen = TimeGANLightning.generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + self.hparams.emb_weight, + ) + self.log( + "loss_gen", + loss_gen, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return loss_gen # update discriminator elif optimizer_idx == 4: @@ -330,18 +387,43 @@ def configure_optimizers(self) -> List[optim.Optimizer]: optimizers.append(optimizer) return optimizers + def train_dataloader(self): + return self._get_dataloader(self.hparams.df_train, "train") + + def val_dataloader(self): + return self._get_dataloader(self.hparams.df_val, "val") + # ---------------------------------------------------------------------------------- # SETUP FUNCTIONS # ---------------------------------------------------------------------------------- - def train_dataloader( - self, - ) -> DataLoader: + def _get_dataloader(self, dfs_to_load: dict, prefix: str) -> 
DataLoader: """ - + Returns a dataloader for train and validation sets. + Args: + dfs_to_load: Either train, validation or test DFs to load. + prefix: Name of phase, either train or val. Returns: - + Instantiated DataLoader. """ - return Dataloader(dataset=dataset, batch_size=self.hparams.batch_size) + # define dataset and plot it + if prefix == "train": + shuffle = True + else: + shuffle = False + dataset = GeneratorCryptoDataset( + df_to_load=dfs_to_load, + cols_to_model=self.hparams.cols_to_model, + target_col=self.hparams.target_col, + mini_series_length=self.hparams.mini_series_length, + last_y=self.hparams.last_y, + data_dir=self.hparams.data_dir, + ) + return DataLoader( + dataset, + batch_size=self.hparams.batch_size, + shuffle=shuffle, + num_workers=self.hparams.num_workers, + ) # ---------------------------------------------------------------------------------- # CALCULATION @@ -456,3 +538,18 @@ def discriminator_loss(y_fake, y_fake_e, y_real, emb_weight): d_loss_real = criterion(y_real, torch.ones_like(y_real)) # TODO: any use of dividing loss by (2 + emb_weight)? 
return emb_weight * d_loss_fake_e + d_loss_fake + d_loss_real + + @staticmethod + def _pre_sanity_check(hparams: Namespace): + # ensure we have the rl specific target column in the config + if hparams.target_col: + raise ValueError("target_col has to be None for GAn development.") + + # fill in the same cols for any df that doesn't have the cols_to_model defined + if len(hparams.cols_to_model) > 1: + for df_name, cols in hparams.cols_to_model.items(): + if df_name != npa.anchor and (cols is None or len(cols) == 0): + hparams.cols_to_model[df_name] = deepcopy( + hparams.cols_to_model[npa.anchor] + ) + return hparams diff --git a/src/dagobert/modelling/dl/__init__.py b/src/dagobert/modelling/dl/__init__.py index 4dd9f930..a2849ec8 100644 --- a/src/dagobert/modelling/dl/__init__.py +++ b/src/dagobert/modelling/dl/__init__.py @@ -1,4 +1,10 @@ -from .data import CryptoDataset, PortfolioCryptoDataset, ExperienceSourceDataset +from .data import ( + CryptoDataset, + PortfolioCryptoDataset, + ExperienceSourceDataset, + GeneratorCryptoDataset, +) + from .tcn_net import TemporalConvNet from .utils import LogCoshLoss, FocalLoss, MixedNormalPDFLoss from .adabelief import AdaBelief diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 91e96391..b670c3bd 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -146,6 +146,7 @@ def __init__( simple_lookahead_reg: If True, instead of returning a 0/1 label, we return the log return between the end of the mini-series and the lookahead time as a label for a regression task. + """ self.df_to_load = df_to_load self.cols_to_model = cols_to_model @@ -200,7 +201,7 @@ def __getitem__(self, idx): def _load_df_anchor( self, - ) -> pd.DatetimeIndex: + ) -> pd.DataFrame: """ Loads the anchor DF, and returns it. We use the anchor df for plotting and to extract the master index which we measure everything else against in batching. 
@@ -266,7 +267,7 @@ def _load_dfs_indices_targets(self) -> Tuple[dict, dict, list]: if df_name == npa.anchor: targets.append(self._get_target(df)) - # load augmnet DFs - dict of list of paths + # load augment DFs - dict of list of paths if self.augment_dfs: for df_name, df_paths in self.augment_dfs.items(): if isinstance(df_paths, str): @@ -334,6 +335,9 @@ def _get_target(self, df: pd.DataFrame) -> np.array: .bfill() .ffill() ).values + # dummy solution for GAN - benefit is that rest of code is unchanged + elif not self.target_col: + return np.zeros(len(df)) else: return df[self.target_col].values @@ -387,7 +391,7 @@ def _get_from_upto_idxs( Returns the from and upto idx for a given sample in the batch given the idx. Since we are indexing with numerical idxes and not dates, if we have multiple dfs in df_train (e.g. anchor and df2), we need to ensure that df2's from and - upto idx-es are at not leaking info from the future and are from roughly the + upto idx-es are not leaking info from the future and are from roughly the same date time period. The same holds for the situation when we replaced anchor df with one of its augment_dfs. Therefore we always return lists of from_idxs and upto_idxs for each df in `batch_dfs`. @@ -571,7 +575,7 @@ class PortfolioCryptoDataset(CryptoDataset): is achieved by adding the rl_return target column to the cols_to_model at init, and then fishing it out for each sample before returning it. - This convulated way was used so we can repurpose and keep as much of the original + This convuluted way was used so we can repurpose and keep as much of the original CryptoDataset as possible, without extensive refactoring. 
""" @@ -616,3 +620,30 @@ def __init__(self, generate_batch: Callable): def __iter__(self) -> Iterable: iterator = self.generate_batch() return iterator + + +class GeneratorCryptoDataset(CryptoDataset): + """ + This extends :class:`dagobert.modelling.dl.data.CryptoDataset` to make it + suitable for synthetic data generation through generative adversarial learning. + + Instead of returning an array of Xs and single y, this returns for only X. + This convuluted way was used so we can repurpose and keep as much of the original + CryptoDataset as possible, without extensive refactoring. + """ + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + + def __getitem__(self, idx): + """ + We don't need to calculate or fetch y, as we only need X to be modelled. + """ + idx = idx.tolist() if torch.is_tensor(idx) else idx + batch_dfs, batch_indices, _ = self._get_batch_dfs_indices_target() + from_idx, upto_idx = self._get_from_upto_idxs(idx, batch_indices) + Xs = self._get_Xs(batch_dfs, from_idx, upto_idx) + # from IPython import embed + # embed() + X = np.concatenate(Xs).T + return X From 232bdd31c7e2902f30a708956ff1faa7faf39373 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Fri, 22 Jan 2021 17:40:23 +0000 Subject: [PATCH 37/62] preproc, volume scaling --- config/timegan_config.yaml | 5 +++ .../modelling/augmentation/timegan.py | 5 +++ src/dagobert/modelling/dl/preprocessing.py | 32 +++++++++++++------ 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index 9c664c4d..cf1b838c 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -56,7 +56,10 @@ data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" df_train: anchor: std_bar_ETHUSDT_tick_1.feather +df_val: +df_test: +# the cols of the secondary DFs will automatically be set to anchor's if not defined cols_to_model: anchor: - date_diff @@ -81,3 +84,5 @@ cols_to_model: # 
-------------------------------------------------------------------------------------- # PREPROCESSING # -------------------------------------------------------------------------------------- + +scaling_method: minmax \ No newline at end of file diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index cf621d5c..6dadf8df 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -121,6 +121,7 @@ def __init__( dropout=dropout, batch_first=batch_first, ) + self.linear_input_size = linear_input_size self.tanh = nn.Tanh() self.linear = nn.Linear(linear_input_size, linear_output_size) self.sigmoid = nn.Sigmoid() @@ -129,6 +130,8 @@ def __init__( def forward(self, x): rnn_out, _hidden = self.rnn(x) rnn_out = self.tanh(rnn_out) + # todo: is there reshaping needed? + # rnn_out = rnn_out.reshape(-1, self.linear_input_size) output = self.linear(rnn_out) if self.linear_activation: output = self.sigmoid(output) @@ -165,6 +168,8 @@ def __init__(self, hparams: Namespace): hparams = TimeGANLightning._pre_sanity_check(hparams) # lightning sets this to cuda too late for some of our setup to work self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" + # prepare datafiles if necessary + hparams = Preprocessing().preprocess_train_dfs(hparams) # TODO: check if real data is the right one, get data in # TODO: any sanity checks on data, hypermparams self.real_logging = None diff --git a/src/dagobert/modelling/dl/preprocessing.py b/src/dagobert/modelling/dl/preprocessing.py index ffa6916a..5654064b 100644 --- a/src/dagobert/modelling/dl/preprocessing.py +++ b/src/dagobert/modelling/dl/preprocessing.py @@ -187,7 +187,7 @@ def preprocess_train_dfs(hparams: Namespace) -> Namespace: supplied parameters. 
If the `df_train` is defined, but the `df_val` and `df_test` are not, then we split the train data into 3, scale them, add sample weights to the train portion and if required binarise the label too. If a - particular combination of preprocessing parameters where already used and + particular combination of preprocessing parameters were already used and therefore we have an existing file already on the machine, we skip the input DF. Args: @@ -466,12 +466,13 @@ def _get_scalers_for_train_dfs(hparams) -> dict: def _get_scalers_from_cols(cols: list, scaling_method: str) -> list: """ For a given dataset's columns, this works out which columns to scale together - (OHLC). It will also creates a scaler for the non-OHLC columns. Furthermore - it will create groups of the supplied OHLC columns depending on the fractional - differencing suffix _fd_x at the end of theses columns. Then it will add the - first element of each group with a scaler then the rest of the group - with the same scaler instance. This is to ensure that OHLC columns are scaled - together and not independently. + (OHLC & volume). It will also creates a scaler for the non-OHLC/volume columns. + Furthermore, it will create groups of the supplied OHLC columns depending on the + fractional differencing suffix _fd_x at the end of these columns. It will also + create groups of the supplied volume cols based on base or quote quantity. + Then it will add the first element of each group with a scaler then the rest of + the group with the same scaler instance. This is to ensure that OHLC & volume + columns are scaled together and not independently. Args: cols: Columns to work on and check for OHLC columns. 
@@ -489,10 +490,14 @@ def _instantiate_scaler(scaling_method): return MinMaxScaler() scalers = [] - # extrack OHLC cols + # extract OHLC and volume cols s_cols = pd.Series(list(cols)) ohlc_cols = s_cols[s_cols.str.contains("open|high|low|close")] - non_ohlc_cols = list(s_cols[~s_cols.str.contains("open|high|low|close")]) + vol_cols = list( + s_cols[(s_cols.str.contains("volume")) & (~s_cols.str.contains("quote"))] + ) + vol_quote_cols = list(s_cols[s_cols.str.contains("volume_quote")]) + non_ohlc_cols = list(s_cols[~s_cols.str.contains("open|high|low|close|volume")]) # find groups of OHLC cols for multiple fd values (easiest to with a df) dh_ohlc_cols_data = [x.split("_fd_") for x in ohlc_cols] + [["na", "na"]] @@ -508,7 +513,7 @@ def _instantiate_scaler(scaling_method): boolean_group_mask = df_ohlc_cols.suffix == unique_group_suffix ohlc_cols_groups.append(list(ohlc_cols.values[boolean_group_mask])) - # add scaler to non OHLC cols + # add scaler to non OHLC/volume cols scalers.append((_instantiate_scaler(scaling_method), non_ohlc_cols)) # add scalers to OHLC col groups: one for the 1st col; then same for rest @@ -516,6 +521,13 @@ def _instantiate_scaler(scaling_method): ohlc_scaler = _instantiate_scaler(scaling_method) scalers.append((ohlc_scaler, ohlc_cols_group.pop(0))) scalers.append((ohlc_scaler, ohlc_cols_group)) + + # add scaler to volume cols, like OHLC + for vol_group in [vol_cols, vol_quote_cols]: + vol_scaler = _instantiate_scaler(scaling_method) + scalers.append((vol_scaler, vol_group.pop(0))) + scalers.append((vol_scaler, vol_group)) + return scalers @staticmethod From 57b4f564086afec81dd0e5656a099d7d71eb652d Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 23 Jan 2021 18:39:20 +0000 Subject: [PATCH 38/62] adding immediate reward like in the pgportfolio paper@ --- config/rl_config.yaml | 36 +++++++------- notebooks/modelling/rl_env.ipynb | 60 ++++++++++++++++-------- src/dagobert/modelling/rl/__init__.py | 2 +- 
src/dagobert/modelling/rl/environment.py | 4 +- src/dagobert/modelling/rl/networks.py | 8 ++-- src/dagobert/modelling/rl/ppo.py | 45 ++++++++++++------ 6 files changed, 94 insertions(+), 61 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 7576287c..b5f29836 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -23,7 +23,7 @@ tags: - RL_test no_comet_logger: True seed: 42 -batch_size: 240 +batch_size: 256 # -------------------------------------------------------------------------------------- # RL @@ -36,13 +36,13 @@ asset_names: - LTC trading_cost: 0.002 reward_type: return -max_episode_length: 240 -steps_per_epoch: 24000 -n_optim_iters: 2 -gamma: 0.0 -lam: 0.90 -lr_actor: 0.0003 -lr_critic: 0.0003 +max_episode_length: 500 +steps_per_epoch: 10000 +n_optim_iters: 4 +gamma: 0 +lam: 0 +lr_actor: 0.0001 +lr_critic: 0.0002 clip_ratio: 0.25 target_kl: 0.01 @@ -59,12 +59,12 @@ no_weight_norm: True # -------------------------------------------------------------------------------------- actor_num_channels: [50, 50, 50, 50, 50] -actor_kernel_size: 3 +actor_kernel_size: 5 actor_dropout: 0.2 # sample size - exp abs diff to mean | 20 - 5% | 50 - 3% | 100 - 2% | 500 - 1% actor_dirichlet_sample_size: 20 critic_num_channels: [50, 50, 50, 50, 50] -critic_kernel_size: 3 +critic_kernel_size: 5 critic_dropout: 0.2 use_last_timepoint: False @@ -79,14 +79,14 @@ lookback: auto mini_series_length: auto df_train: - anchor: std_bar_BTCUSDT_volume_100.feather - df2: std_bar_ETHUSDT_volume_500.feather - df3: std_bar_XRPUSDT_volume_125000.feather - df4: std_bar_LTCUSDT_volume_1000.feather - # anchor: std_bar_BTCUSDT_tick_1.feather - # df2: std_bar_ETHUSDT_tick_1.feather - # df3: std_bar_XRPUSDT_tick_1.feather - # df4: std_bar_LTCUSDT_tick_1.feather + # anchor: std_bar_BTCUSDT_volume_100.feather + # df2: std_bar_ETHUSDT_volume_500.feather + # df3: std_bar_XRPUSDT_volume_125000.feather + # df4: std_bar_LTCUSDT_volume_1000.feather + anchor: 
std_bar_BTCUSDT_tick_1.feather + df2: std_bar_ETHUSDT_tick_1.feather + df3: std_bar_XRPUSDT_tick_1.feather + df4: std_bar_LTCUSDT_tick_1.feather df_val: df_test: cols_to_model: diff --git a/notebooks/modelling/rl_env.ipynb b/notebooks/modelling/rl_env.ipynb index e4e1b0f2..e0172cb0 100644 --- a/notebooks/modelling/rl_env.ipynb +++ b/notebooks/modelling/rl_env.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -37,7 +37,7 @@ "array([0.48192771, 0.26506024, 0.25301205])" ] }, - "execution_count": 9, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -70,7 +70,7 @@ "0.00020481927710843396" ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -96,7 +96,7 @@ "0.9997951807228915" ] }, - "execution_count": 53, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -107,7 +107,27 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p0" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -116,7 +136,7 @@ "1.0372875000000001" ] }, - "execution_count": 45, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -129,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -138,7 
+158,7 @@ "0.03728750000000014" ] }, - "execution_count": 46, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -160,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -169,11 +189,11 @@ "text": [ "p1 1.0366750000000002\n", "rho 0.036675000000000235\n", - "p1 1.0674461056875002\n", - "rho 0.029682500000000056\n", - "p1 1.0911367376956023\n", - "rho 0.022193750000000012\n", - "p1 1.0911367376956023\n", + "p1 1.0519309684687503\n", + "rho 0.014716249999999986\n", + "p1 1.05953774428449\n", + "rho 0.007231250000000022\n", + "p1 1.05953774428449\n", "rho 0.0\n" ] } @@ -196,16 +216,16 @@ "w1 = np.array([.8, .1, .1])\n", "p1 = step(y1, w1, w0, p0)\n", "\n", - "y2 = np.array([1, 1.2, 1.1])\n", + "y2 = np.array([1, 1.1, 1.05])\n", "w2 = np.array([.9, .05, .05])\n", "p2 = step(y2, w2, w1, p1)\n", "\n", "\n", - "y3 = np.array([1, 1.3, 1.15])\n", + "y3 = np.array([1, 1.1, 1.05])\n", "w3 = np.array([1, 0, 0])\n", "p3 = step(y3, w3, w2, p2)\n", "\n", - "y4 = np.array([1, 1.5, 1.5])\n", + "y4 = np.array([1, 1.1, 1.05])\n", "w4 = np.array([1, 0, 0])\n", "p4 = step(y4, w4, w3, p3)\n" ] diff --git a/src/dagobert/modelling/rl/__init__.py b/src/dagobert/modelling/rl/__init__.py index d4900664..fd00b0bd 100644 --- a/src/dagobert/modelling/rl/__init__.py +++ b/src/dagobert/modelling/rl/__init__.py @@ -1,3 +1,3 @@ from .environment import RLData, RLPortfolio, RLEnv -from .networks import ActorCriticTCN, ActorCriticAgent, ActorContinous +from .networks import ActorCriticTCN, ActorCriticAgent, ActorContinuous from .ppo import PPO diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 760d9639..f31b1cb7 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -48,6 +48,7 @@ def __init__( train_val_test: Whether we are training, validating or testing, it must be either train, val or test. 
""" + # TODO: make multi head environment self.hparams = hparams if train_val_test == "train": @@ -137,9 +138,6 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: # (eq16) cost to change portfolio: # excluding change in cash to avoid double counting for transaction cost - # TODO: isn't this unrealistic? this would only work if there's an efficient - # market between all pairs within the portfilio, i.e XRP -> LTC, althought - # this is what pgportfolio uses too mu = self.trading_cost * (np.abs(dw1[1:] - w1[1:])).sum() # (eq11) final portfolio value: I thought this should be w1 (at the end), but diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 2a0365b5..69c11abc 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -122,12 +122,12 @@ def forward(self, states, past_pw): if self.sample_size > 0: actions = pi.sample((self.sample_size,)).mean(dim=0) else: - actions = pi.sample.mean + actions = pi.mean # very rarely we get actions that don't sum to 1 or are negative, fix it here - if actions.sum() != past_pw.shape[0]: - actions = torch.clip(actions, 0.01, 0.99) - actions = (actions.T / actions.sum(dim=1)).T + if actions.sum() != past_pw.shape[0] or torch.any(actions < 0): + actions = torch.clamp(actions, 0.01, 0.99) + actions = (actions.T / actions.sum(dim=-1)).T return pi, actions, logits def get_log_prob(self, pi: Dirichlet, actions: torch.Tensor): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index ff488143..e93e3000 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -157,6 +157,7 @@ def generate_experience_buffer( Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage. 
""" + # TODO: make this optional and multi head env the default and set no_weight_norm max_worker_steps = int(self.hparams.steps_per_epoch / self.hparams.num_workers) parallel_experiences = ParallelExperiences() device = self.setup_model_for_experience_gathering() @@ -254,12 +255,12 @@ def discount_rewards(rewards: List[float], gamma: float = 0.99) -> List[float]: list of discounted rewards/advantages """ assert isinstance(rewards[0], float) - cumul_reward = [] + cum_r = [] sum_r = 0.0 for r in reversed(rewards): sum_r = (sum_r * gamma) + r - cumul_reward.append(sum_r) - return list(reversed(cumul_reward)) + cum_r.append(sum_r) + return list(reversed(cum_r)) @staticmethod def calc_advantage( @@ -269,9 +270,9 @@ def calc_advantage( lam: float = 0.95, ) -> List[float]: """ - Calculate the advantage given rewards, state values, and last value of episode. + Calculate the advantage given rewards and state values for an episode. The advantage compares how much better the actor did compared to what the - critic thought the given state is worth in reward. + critic thought the given state is worth in reward (value). 
Args: rewards: list of episode rewards @@ -342,6 +343,7 @@ def configure_optimizers(self) -> List[optim.Optimizer]: return optimizer_actor, optimizer_critic def optimizer_step(self, *args, **kwargs): + # TODO: figure out a way to do kl divergence clipping for _ in range(self.hparams.n_optim_iters): super().optimizer_step(*args, **kwargs) @@ -363,10 +365,11 @@ def training_step( Returns: loss """ + # TODO: investigate rewards, advantage and why we get negative actor loss + state, past_pw, action, old_logp, qval, adv = batch # normalize advantages within batch adv = (adv - adv.mean()) / adv.std() - # log all metrics (other than loss) for k, v in self.to_log.items(): self.log(k, v, on_step=False, on_epoch=True) @@ -405,7 +408,7 @@ def _pre_sanity_check(hparams: Namespace): # -------------------------------------------------------------------------------------- -# HELPER CLASSES FOR PARALLEL EXPERIENCE COLLECTION +# PARALLEL EXPERIENCE COLLECTION # # Moving this to another module would result in circular dependencies. Been there, # done that, it was painful, so let's just leave these here. @@ -581,6 +584,8 @@ def gather_experience( Returns: Experience collected in this parallel worker. """ + from datetime import datetime + buffer = ExperienceBuffer() state = env.reset() past_pw = PPO._init_past_pw(asset_num, device) @@ -595,14 +600,24 @@ def gather_experience( past_pw = PPO._update_past_pw(info["portfolio_value"], actor_logits, device) terminal = len(buffer.ep_rewards) == max_episode_length - if done or terminal: - # this is specific to our special environment setup - # buffer.shift_rewards() - buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma) - buffer.advs += PPO.calc_advantage( - buffer.ep_rewards, buffer.ep_values, gamma, lam - ) - buffer.done_episodes += 1 + if done or terminal or step == max_steps - 1: + # if rewards are immediate, we need this is due to our special environment + # where the immediate reward of a_0 can only calculate at t_1. 
+ # if gamma == 0: + # buffer.shift_rewards() + # buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma) + # buffer.advs += PPO.calc_advantage( + # buffer.ep_rewards, buffer.ep_values, gamma, lam + # ) + + # according to the PGPortfolio paper, reward should be the sum of portfolio + # values, divided by length of episode - no discounting no BS, same for adv + epr = buffer.ep_rewards + epr = np.ones_like(epr) * sum(epr) / len(epr) + buffer.qvals += list(epr) + buffer.advs += list(epr - np.array(buffer.ep_values)) + if done or terminal: + buffer.done_episodes += 1 buffer.epoch_rewards += np.sum(buffer.ep_rewards) buffer.p_ep_end_value.append(info["portfolio_value"]) From e92e857826f1c9a288f961ce7e8b594e1197a3e0 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Tue, 26 Jan 2021 21:39:32 +0000 Subject: [PATCH 39/62] making ppo env and reward calculation as close to the pgportfolio paper as possible --- config/rl_config.yaml | 11 ++++---- src/dagobert/modelling/rl/environment.py | 1 - src/dagobert/modelling/rl/networks.py | 9 ++++++- src/dagobert/modelling/rl/ppo.py | 33 ++++++++++++------------ 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index b5f29836..1d398528 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -36,13 +36,13 @@ asset_names: - LTC trading_cost: 0.002 reward_type: return -max_episode_length: 500 -steps_per_epoch: 10000 -n_optim_iters: 4 +max_episode_length: 2048 +steps_per_epoch: 24576 +n_optim_iters: 6 gamma: 0 lam: 0 lr_actor: 0.0001 -lr_critic: 0.0002 +lr_critic: 0.0003 clip_ratio: 0.25 target_kl: 0.01 @@ -62,7 +62,7 @@ actor_num_channels: [50, 50, 50, 50, 50] actor_kernel_size: 5 actor_dropout: 0.2 # sample size - exp abs diff to mean | 20 - 5% | 50 - 3% | 100 - 2% | 500 - 1% -actor_dirichlet_sample_size: 20 +actor_dirichlet_sample_size: 0 critic_num_channels: [50, 50, 50, 50, 50] critic_kernel_size: 5 critic_dropout: 0.2 @@ -73,7 +73,6 @@ use_last_timepoint: 
False # -------------------------------------------------------------------------------------- data_dir: "C:/Work/dagobert/data/modelling" -#data_dir: "/home/daniel/dagobert_data/modelling" lookback: auto mini_series_length: auto diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index f31b1cb7..5bde2888 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -175,7 +175,6 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: for i, name in enumerate(["USD"] + self.asset_names): info["weight_" + name] = w1[i] info["price_" + name] = y1[i] - self.infos.append(info) return reward, info, done diff --git a/src/dagobert/modelling/rl/networks.py b/src/dagobert/modelling/rl/networks.py index 69c11abc..238a03db 100644 --- a/src/dagobert/modelling/rl/networks.py +++ b/src/dagobert/modelling/rl/networks.py @@ -56,6 +56,7 @@ def __init__( self.linear_a = nn.Linear(n_actions + 1, num_channels[-1]) self.linear1 = nn.Linear(hparams.mini_series_length, 1) self.linear2 = nn.Linear(num_channels[-1] * 2, output_size) + self.linear_m = nn.Linear(num_channels[-1] * 2, 1) def forward(self, state, past_pw): s1 = self.tcn(*state) @@ -65,7 +66,13 @@ def forward(self, state, past_pw): else: s2 = torch.tanh(self.linear1(s1).squeeze(-1)) # bring together the state and past_pw representations - return self.linear2(torch.cat([s2, a1], dim=1)) + if self.actor: + # m decides whether we update old weights or not by mixing past_pw and new + m = torch.sigmoid(self.linear_m(torch.cat([s2, a1], dim=1))) + past_w = past_pw[:, 1:] + return m * past_w + (1 - m) * self.linear2(torch.cat([s2, a1], dim=1)) + else: + return self.linear2(torch.cat([s2, a1], dim=1)) class ActorContinuous(nn.Module): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index e93e3000..d27bc733 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ 
-227,16 +227,17 @@ def update_metrics_to_log(self): self.to_log[f"{e}/avg_total_reward"] = ep_rewards / done_eps self.to_log[f"{e}/avg_step_reward"] = ep_rewards / self.hparams.steps_per_epoch self.to_log[f"{e}/avg_len"] = self.hparams.steps_per_epoch / done_eps - self.to_log[f"{p}/avg_value_epend"] = ( + self.to_log[f"{p}/avg_value_ep_end"] = ( sum(list(self.buffer.p_ep_end_value)) / done_eps ) - + self.to_log[f"{p}/avg_market_return_ep_end"] = ( + sum(list(self.buffer.p_ep_end_market_return)) / done_eps + ) # need this otherwise the generator won't work multiple times infos = pd.DataFrame(list(self.buffer.infos)).mean() self.to_log[f"{p}/avg_value"] = infos["portfolio_value"] self.to_log[f"{p}/avg_weight_std"] = infos["weights_std"] self.to_log[f"{p}/avg_rebalancing_cost"] = infos["rebalancing_cost"] - self.to_log[f"{p}/avg_market_return"] = infos["market_return"] for w in infos.index[infos.index.str.contains("weight_")]: self.to_log[f"weights/{w}"] = infos[w] @@ -326,7 +327,7 @@ def actor_loss(self, state, past_pw, action, old_logp, adv) -> torch.Tensor: clip_ratio = torch.clamp( ratio, 1 - self.hparams.clip_ratio, 1 + self.hparams.clip_ratio ) - loss_actor = -(torch.min(ratio * adv, clip_ratio * adv)).mean() + loss_actor = -torch.min(ratio * adv, clip_ratio * adv).mean() return loss_actor, approx_kl def critic_loss(self, state, past_pw, qval) -> torch.Tensor: @@ -365,23 +366,17 @@ def training_step( Returns: loss """ - # TODO: investigate rewards, advantage and why we get negative actor loss - state, past_pw, action, old_logp, qval, adv = batch # normalize advantages within batch - adv = (adv - adv.mean()) / adv.std() + # adv = (adv - adv.mean()) / adv.std() + # log all metrics (other than loss) for k, v in self.to_log.items(): self.log(k, v, on_step=False, on_epoch=True) if optimizer_idx == 0: - loss_actor, approx_kl = self.actor_loss( state, past_pw, action, old_logp, adv ) - if torch.isnan(loss_actor): - from IPython import embed - - embed() 
 self.log("loss/actor", loss_actor, on_epoch=True, on_step=False)
 self.log("loss/approx_kl", approx_kl, on_epoch=True, on_step=False)
 return loss_actor
@@ -476,7 +471,6 @@ def shift_rewards(self):
 self.actions.pop(-1)
 self.logps.pop(-1)
 self.infos.pop(-1)
- self.ep_rewards.pop(-1)
 self.ep_values.pop(-1)
 def merge_buffers(self, buffers):
@@ -495,6 +489,9 @@ def merge_buffers(self, buffers):
 self.advs = chain(*[buffer.advs for buffer in buffers])
 self.infos = chain(*[buffer.infos for buffer in buffers])
 self.p_ep_end_value = chain(*[buffer.p_ep_end_value for buffer in buffers])
+ self.p_ep_end_market_return = chain(
+ *[buffer.p_ep_end_market_return for buffer in buffers]
+ )
 self.done_episodes = sum([buffer.done_episodes for buffer in buffers])
 self.epoch_rewards = sum([buffer.epoch_rewards for buffer in buffers])
@@ -509,6 +506,7 @@ def clear_buffer(self):
 self.logps = []
 self.infos = []
 self.p_ep_end_value = []
+ self.p_ep_end_market_return = []
 # episode / epoch vars
 self.ep_rewards = []
@@ -601,15 +599,15 @@ def gather_experience(
 terminal = len(buffer.ep_rewards) == max_episode_length
 if done or terminal or step == max_steps - 1:
- # if rewards are immediate, we need this is due to our special environment
- # where the immediate reward of a_0 can only calculate at t_1.
- # if gamma == 0:
- # buffer.shift_rewards()
 # buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma)
 # buffer.advs += PPO.calc_advantage(
 # buffer.ep_rewards, buffer.ep_values, gamma, lam
 # )
+ # if rewards are immediate, we need this shift due to our special environment
+ # where the immediate reward of a_0 can only be calculated at t_1. 
+ if gamma == 0: + buffer.shift_rewards() # according to the PGPortfolio paper, reward should be the sum of portfolio # values, divided by length of episode - no discounting no BS, same for adv epr = buffer.ep_rewards @@ -620,6 +618,7 @@ def gather_experience( buffer.done_episodes += 1 buffer.epoch_rewards += np.sum(buffer.ep_rewards) buffer.p_ep_end_value.append(info["portfolio_value"]) + buffer.p_ep_end_market_return.append(np.array(info["market_return"]).prod()) # episode over, reset the env and the episode buffer buffer.ep_rewards = [] From 9506e3ff06d69020540d25cdcc22a56e71f1f167 Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Tue, 26 Jan 2021 22:21:46 +0000 Subject: [PATCH 40/62] making ppo env and reward calculation as close to the pgportfolio paper as possible --- config/rl_config.yaml | 3 ++- src/dagobert/modelling/rl/environment.py | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 1d398528..55ed4acf 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -17,7 +17,7 @@ auto_scale_batch_size: # -------------------------------------------------------------------------------------- log_dir: logs -num_workers: 4 +num_workers: 1 exp_name: RL-PPO-TCN tags: - RL_test @@ -36,6 +36,7 @@ asset_names: - LTC trading_cost: 0.002 reward_type: return +num_env_heads: 4 max_episode_length: 2048 steps_per_epoch: 24576 n_optim_iters: 6 diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 5bde2888..e6a57db3 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -45,10 +45,10 @@ def __init__( - augment_prob - augment_method - augment_dfs_mix + - num_env_heads train_val_test: Whether we are training, validating or testing, it must be either train, val or test. 
""" - # TODO: make multi head environment self.hparams = hparams if train_val_test == "train": @@ -69,21 +69,30 @@ def __init__( augment_dfs_mix=self.hparams.augment_dfs_mix, ) self.dataset_len = len(self.dataset) - self.idx = np.random.randint(self.dataset_len - self.hparams.max_episode_length) + self._reset_idxs() def step(self): - Xs, ys = self.dataset[self.idx] + from IPython import embed + + embed() + Xs, ys = self.dataset[self.idxs] # add cash price (always 1) to the new price vector y1 = np.concatenate([[1.0], ys]) # turn Xs into a batch of 1, ready to be fed into the actor/critic Xs = [torch.Tensor(x).unsqueeze(0) for x in Xs] - self.idx += 1 + self.idxs += 1 return Xs, y1 def reset(self): - self.idx = np.random.randint(self.dataset_len - self.hparams.max_episode_length) + self._reset_idxs() return self.step() + def _reset_idxs(self): + self.idxs = [ + np.random.randint(self.dataset_len - self.hparams.max_episode_length) + for _ in self.hparams.num_env_heads + ] + class RLPortfolio(object): """ From 6c31d1852d06a534fc236d2458e4209b936e1d5b Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Fri, 29 Jan 2021 17:38:04 +0000 Subject: [PATCH 41/62] sorting shit --- .../modelling/augmentation/tgan_args.py | 186 +++++++++++++++++ .../modelling/augmentation/tgan_runner.py | 38 ++++ .../modelling/augmentation/timegan.py | 195 +++++++++++++++++- src/dagobert/modelling/augmentation/utils.py | 47 +++++ 4 files changed, 458 insertions(+), 8 deletions(-) create mode 100644 src/dagobert/modelling/augmentation/tgan_args.py create mode 100644 src/dagobert/modelling/augmentation/tgan_runner.py diff --git a/src/dagobert/modelling/augmentation/tgan_args.py b/src/dagobert/modelling/augmentation/tgan_args.py new file mode 100644 index 00000000..32853ad6 --- /dev/null +++ b/src/dagobert/modelling/augmentation/tgan_args.py @@ -0,0 +1,186 @@ +""" +All custom arguments and hyper-parameters for the reinforcement learning module. 
+""" + +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + +from pytorch_lightning import Trainer + +from dagobert.modelling.dl.tcn import TCNLightning +from dagobert.modelling.dl.tcn_args import ( + add_run_specific_args, + add_data_specific_args, + add_preprocessing_specific_args, +) +from dagobert.naming import ( + NInputDataCols, + NAugmentationMethods, + NBarriers, + NPreprocessingArgs, +) + + +def add_rl_specific_args(parent_parser): + parser = ArgumentParser( + parents=[parent_parser], + add_help=False, + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # this is just a place-holder so it's easier to read the million params in the cmd + parser.add_argument("--RL_PARAMS", help="====================================") + parser.add_argument( + "--asset_names", + type=str, + nargs="+", + default=["BTC", "ETH"], + help=( + "Names of instruments to include in the portfolio, corresponding to " + "anchor, df2, df3, etc." + ), + ) + parser.add_argument( + "--trading_cost", + type=float, + default=0.002, + help="Commission rate of making trades + an estimated cost of slippage.", + ) + parser.add_argument( + "--reward_type", + type=str, + default="return", + help=( + "Determines the overall reward to maximise by the agent. Either return or " + "sharpe. See RLPortfolio class for more details." + ), + ) + parser.add_argument( + "--max_episode_length", + type=int, + default=1000, + help=( + "Maximum number of interactions between the agent and the environment in " + "an episode." + ), + ) + parser.add_argument( + "--steps_per_epoch", + type=int, + default=10000, + help=( + "How many action-state pairs to rollout for trajectory collection per " + "epoch. I.e. if all episodes run to their max_episode_length, we'll have " + "steps_per_epoch/max_episode_length number of unique episodes/trajectories." + ), + ) + parser.add_argument( + "--n_optim_iters", + type=int, + default=4, + help=( + "How many steps of gradient descent to perform on each batch. 
This might " + "seem weird, but it helps sampling efficiency, done by the original PPO " + "implementation and the Google ablation study found it to be useful." + ), + ) + parser.add_argument( + "--gamma", type=float, default=0.99, help="Discounting of rewards." + ) + parser.add_argument( + "--lam", + type=float, + default=0.95, + help="Lambda parameter in the advantage discounting equation.", + ) + parser.add_argument( + "--lr_actor", + type=float, + default=0.0003, + help="Learning rate for the actor/policy network.", + ) + parser.add_argument( + "--lr_critic", + type=float, + default=0.001, + help="Learning rate for the critic/value network.", + ) + parser.add_argument( + "--clip_ratio", + type=float, + default=0.2, + help="Clipping parameter for the PPO's policy upgrade cost function.", + ) + + return parser + + +def add_model_specific_args(parent_parser): + parser = ArgumentParser( + parents=[parent_parser], + add_help=False, + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # this is just a place-holder so it's easier to read the million params in the cmd + parser.add_argument("--MODEL_PARAMS", help="====================================") + parser.add_argument( + "--actor_num_channels", + type=int, + nargs="+", + default=[50, 50, 50, 50, 50], + help=( + "Determines the number of layers (depth) of the actor / policy network and " + "the hidden unit count in each layer." + ), + ) + parser.add_argument( + "--critic_num_channels", + type=int, + nargs="+", + default=[50, 50, 50, 50, 50], + help=( + "Determines the number of layers (depth) of the critic / value network and " + "the hidden unit count in each layer." 
+ ), + ) + parser.add_argument("--actor_kernel_size", type=int, default=5, help=" ") + parser.add_argument("--critic_kernel_size", type=int, default=5, help=" ") + parser.add_argument("--actor_dropout", type=float, default=0, help=" ") + parser.add_argument("--critic_dropout", type=float, default=0, help=" ") + parser.add_argument( + "--no_class_weights", + action="store_true", + help=( + "Set this to True so we can leverage the Preprocessing pipeline written " + "for the supervised DL module." + ), + ) + parser.add_argument( + "--use_last_timepoint", + action="store_true", + help=( + "If this flag is used the only the network's representation " + "corresponding at the latest time-point is used to predict the outcome." + "By default, we combine all representations across the sequence length" + "to make a prediction from, instead of just using the last one." + ), + ) + return parser + + +def get_all_args(): + parser = ArgumentParser( + description="Lightning RL module", + formatter_class=ArgumentDefaultsHelpFormatter, + ) + + # add model params of lightning trainer (this HAS to be first) + parser = Trainer.add_argparse_args(parser) + + # add model and run specific params + parser = add_rl_specific_args(parser) + parser = add_model_specific_args(parser) + parser = add_run_specific_args(parser) + parser = add_data_specific_args(parser) + parser = add_preprocessing_specific_args(parser) + return parser.parse_args() diff --git a/src/dagobert/modelling/augmentation/tgan_runner.py b/src/dagobert/modelling/augmentation/tgan_runner.py new file mode 100644 index 00000000..3ae6c027 --- /dev/null +++ b/src/dagobert/modelling/augmentation/tgan_runner.py @@ -0,0 +1,38 @@ +""" +Dagobert's runner for TimeGAN. + +This module is driven by the `dagobert-tgan` command which can be parametrised by +command line arguments, but it's much more convenient to use YAML configs for this, +see the `tcn_args.py` and `tgan_args.py` for more detail. 
+""" +import logging +from pathlib import Path + +from dagobert.utils import setup_logging +from dagobert.runner_utils import load_config, update_args +from dagobert.modelling.augmentation.tgan_args import get_all_args +from dagobert.modelling.augmentation.timegan import run_tgan + + +logger = logging.getLogger(__name__) + + +def run(): + """ + Initialise a TimeGan network and train it. + """ + + # parse arguments and setup logging + args = get_all_args() + setup_logging(logger, "dagobert-tgan", logging.INFO, args.log_dir) + + # load config yaml if exist + if args.config_path != "": + config = load_config(Path(args.config_path)) + args = update_args(args, config) + + run_tgan(args) + + +if __name__ == "__main__": + run() diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 6dadf8df..c750aa49 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -10,6 +10,7 @@ from copy import deepcopy from pathlib import Path +from sklearn.manifold import TSNE import numpy as np import pandas as pd import matplotlib @@ -37,7 +38,7 @@ LogCoshLoss, FocalLoss, ) -from dagobert.modelling.augmentation.utils import get_noise +from dagobert.modelling.augmentation.utils import get_noise, pca_analysis logger = logging.getLogger(__name__) @@ -70,7 +71,7 @@ def run_tgan(args): filename="_{epoch:02d}_{avg_reward:.10f}", dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", save_top_k=3, - mode="max", + mode="min", ) # define trainer and and lightning module @@ -141,11 +142,13 @@ def forward(self, x): class TimeGANLightning(LightningModule): """ Lightning model made of 5 RNN nets working together: - - Embedding network between original feature space to latent space. + - Embedding network between original feature space to latent space, provides + lower-dimensional adversarial learning space. - Recovery network from latent space to original space. 
- Generator function: generate time-series data in latent space. - Discriminate the original and synthetic time-series data - - Supervisor generating next sequence using the previous sequence. + - Supervisor generating next sequence using the previous sequence to better + capture temporal dynamics """ # ---------------------------------------------------------------------------------- @@ -170,7 +173,7 @@ def __init__(self, hparams: Namespace): self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" # prepare datafiles if necessary hparams = Preprocessing().preprocess_train_dfs(hparams) - # TODO: check if real data is the right one, get data in + # TODO: any sanity checks on data, hypermparams self.real_logging = None self.comet_logging = not self.hparams.no_comet_logger @@ -361,14 +364,20 @@ def training_step(self, batch, batch_idx, optimizer_idx): y_fake, y_fake_e, y_real, self.hparams.emb_weight ) self.log( - "loss_disc", + "loss_disc/train", loss_disc, on_step=False, on_epoch=True, prog_bar=True, logger=True, ) - return loss_disc + # pytorch lightning needs to have "loss" in the return dict + return { + "loss_disc/train": loss_disc, + "y_fake/train": y_fake, + "y_fake_e/train": y_fake_e, + "y_real/train": y_real, + } def configure_optimizers(self) -> List[optim.Optimizer]: """ @@ -395,6 +404,57 @@ def configure_optimizers(self) -> List[optim.Optimizer]: def train_dataloader(self): return self._get_dataloader(self.hparams.df_train, "train") + def training_epoch_end(self, outputs): + return self._epoch_end(outputs, "train") + + def validation_step(self, batch, batch_idx): + x = batch + batch_len = len(x) + + # noise + z = get_noise( + batch_len, + self.hparams.mini_series_length, + self.hparams.z_dim, + device=self.tgan_device, + ) + z = z.to(self.generator.model[0].weight.dtype) + + # generate fake data and compare with validation set + h = self.embedder(x) + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + x_hat = self.recovery(h_hat) + + y_fake = 
self.discriminator(h_hat.detach()) + y_fake_e = self.discriminator(e_hat.detach()) + y_real = self.discriminator(h.detach()) + + pca_x, pca_x_hat = pca_analysis(x, x_hat) + + loss_disc = TimeGANLightning.discriminator_loss( + y_fake, y_fake_e, y_real, self.hparams.emb_weight + ) + self.log( + "loss_disc/val", + loss_disc, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return { + "loss_disc/val": loss_disc, + "y_fake/val": y_fake, + "y_fake_e/val": y_fake_e, + "y_real/val": y_real, + "pca_x/val": pca_x, + "pca_x_hat/val": pca_x_hat, + } + + def validation_epoch_end(self, outputs): + return self._epoch_end(outputs, "val") + def val_dataloader(self): return self._get_dataloader(self.hparams.df_val, "val") @@ -431,7 +491,7 @@ def _get_dataloader(self, dfs_to_load: dict, prefix: str) -> DataLoader: ) # ---------------------------------------------------------------------------------- - # CALCULATION + # LOSS CALCULATION # ---------------------------------------------------------------------------------- @staticmethod def embed_loss0(x_tilde, x): @@ -544,6 +604,125 @@ def discriminator_loss(y_fake, y_fake_e, y_real, emb_weight): # TODO: any use of dividing loss by (2 + emb_weight)? return emb_weight * d_loss_fake_e + d_loss_fake + d_loss_real + # ---------------------------------------------------------------------------------- + # OTHER CALCULATION + # ---------------------------------------------------------------------------------- + def _epoch_end(self, outputs, prefix="val"): + """ + We average the loss across all batches, calculate metrics based on all batches + and log them. Finally, we make plots using all the y_true and y_preds. 
+ Args: + outputs: + prefix: + + Returns: + + """ + avg_loss = [] + y_true = [] + y_fake = [] + y_fake_e = [] + for x in outputs: + avg_loss.append(x[f"loss_disc/{prefix}"]) + y_true.append(x[f"y_true/{prefix}"]) + y_fake.append(x[f"y_fake/{prefix}"]) + y_fake_e.append(x[f"y_fake_e/{prefix}"]) + # log sampled images + self._make_plots(y_true, y_fake, prefix) + + def _calculate_metrics(self, y_true, y_pred, prefix): + """ + Calculates and logs various metrics for regression and classification use cases. + """ + if self.hparams.output_size == 1: + y_pred = y_pred.squeeze(-1) + y_true = y_true.squeeze(-1) + + if self.hparams.regression or self.hparams.mix_density_net: + mae = plm.mean_absolute_error(y_pred, y_true) + self.log(f"mean_absolute_error/{prefix}", mae) + spearman = spearmanr(t2n(y_pred), t2n(y_true)).correlation + self.log(f"spearman_r/{prefix}", spearman) + else: + if self.hparams.output_size == 1: + self.log(f"au_roc/{prefix}", plm.auroc(y_pred, y_true)) + prec, rec, _ = plm.precision_recall_curve(y_pred, y_true) + self.log(f"au_pr/{prefix}", plm.auc(rec, prec)) + elif self.hparams.output_size == 3: + y_pred = torch.argmax(torch.softmax(y_pred, dim=1), dim=1) + cm = cm_from_tensor(y_true, y_pred) + self.log(f"triple_barrier_error/{prefix}", triple_barrier_error(cm)) + self.log(f"non_vertical_error/{prefix}", non_vertical_error(cm)) + + # ---------------------------------------------------------------------------------- + # PLOTTING AND LOGGING FUNCTIONS + # ---------------------------------------------------------------------------------- + def _make_plots(self, y_true, y_pred, prefix): + """ + Makes the following useful summary plots of true and predicted ys: + - scatter plot of y_true and y_pred + - histogram of y_true, y_pred + - AUPR, AUROC + - confusion matrices for classification + """ + # SCATTER + if self.hparams.regression or self.hparams.mix_density_net: + self._log_image( + f"true v pred scatter/{prefix}", + plot_from_tensor(y_true, y_pred), + 
self.current_epoch, + ) + else: + # HISTOGRAM + if self.hparams.output_size == 1: + y_pred_class = (torch.sigmoid(y_pred) > 0.5).int() + y_pred_for_hist = torch.sigmoid(y_pred) + else: + y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1) + y_pred_for_hist = y_pred_class + self._log_image( + f"true v pred hist/{prefix}", + hist_from_tensor(y_true, y_pred_for_hist), + self.current_epoch, + ) + + # PR, ROC + if self.hparams.output_size == 1: + y_pred = y_pred.squeeze(-1) + y_true = y_true.squeeze(-1) + fpr, tr, _ = plm.roc(y_pred, y_true) + self._log_image( + f"roc/{prefix}", + plot_from_tensor(fpr, tr, "line", "FPR", "TPR"), + self.current_epoch, + ) + prec, rec, thr = plm.precision_recall_curve(y_pred, y_true) + self._log_image( + f"pr/{prefix}", + plot_from_tensor(rec, prec, "line", "recall", "precision"), + self.current_epoch, + ) + + # CM - high confidence binary classification + proba_filter = self.hparams.confident_binary_proba_threshold + if y_pred.max() >= proba_filter or y_pred.min() <= (1.0 - proba_filter): + self._log_image( + f"high confidence true v pred cm/{prefix}", + plot_cm(cm_from_tensor(y_true, y_pred, proba_filter)), + self.current_epoch, + ) + + # CMs - all classification + self._log_image( + f"true v pred cm/{prefix}", + plot_cm(cm_from_tensor(y_true, y_pred_class)), + self.current_epoch, + ) + + # ---------------------------------------------------------------------------------- + # SANITY CHECK FUNCTIONS + # ---------------------------------------------------------------------------------- + @staticmethod def _pre_sanity_check(hparams: Namespace): # ensure we have the rl specific target column in the config diff --git a/src/dagobert/modelling/augmentation/utils.py b/src/dagobert/modelling/augmentation/utils.py index 72e162eb..9137d97d 100644 --- a/src/dagobert/modelling/augmentation/utils.py +++ b/src/dagobert/modelling/augmentation/utils.py @@ -1,6 +1,8 @@ """Util functions for TimeGAN and other augmentation related tasks""" 
import torch +import numpy as np +from sklearn.decomposition import PCA def get_noise(n_samples: int, mini_series_length: int, z_dim: int, device: str = "cpu"): @@ -19,3 +21,48 @@ def get_noise(n_samples: int, mini_series_length: int, z_dim: int, device: str = Tensor of filled with random numbers from uniform distribution. """ return torch.rand(n_samples, mini_series_length, z_dim, device=device) + + +def pca_analysis(x, x_hat, components: int = 2): + """ + PCA on 2 (real and synthetic) datasets + Args: + x: real data of shape (batch, time, feature) + x_hat: synthetic data of the same shape + components: number of pca components to keep + + Returns: + 2 arrays of PCA-reduced real and synthetic data + """ + x = np.asarray(x) + x_hat = np.asarray(x_hat) + + x = np.mean(x, 2) + x_hat = np.mean(x_hat, 2) + + # Parameters + No = x.shape[0] + colors = ["red" for i in range(No)] + ["blue" for i in range(No)] + + # PCA Analysis + pca = PCA(n_components=components) + pca.fit(x) + pca_results = pca.transform(x) + pca_hat_results = pca.transform(x_hat) + return pca_results, pca_hat_results + # Plotting + """ + f, ax = plt.subplots(1) + + plt.scatter(pca_results[:, 0], pca_results[:, 1], c=colors[:No], alpha=0.2, + label="Original") + plt.scatter(pca_hat_results[:, 0], pca_hat_results[:, 1], c=colors[No:], + alpha=0.2, label="Synthetic") + + ax.legend() + + plt.title('PCA plot') + plt.xlabel('x-pca') + plt.ylabel('y_pca') + plt.show() + """ From c504ca743428712f661d1da80c14757e1c51f6b9 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Fri, 29 Jan 2021 19:21:33 +0000 Subject: [PATCH 42/62] plots --- .../modelling/augmentation/timegan.py | 154 ++++++++---------- src/dagobert/modelling/augmentation/utils.py | 24 --- src/dagobert/modelling/dl/data.py | 4 +- src/dagobert/modelling/utils.py | 30 ++++ 4 files changed, 103 insertions(+), 109 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index c750aa49..f26acde9 
100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -28,6 +28,7 @@ from pytorch_lightning.trainer import seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning import Trainer, Callback, loggers +from pytorch_lightning.metrics import functional as plm from dagobert.naming import NStudy, NPreprocessingArgs as npa from dagobert.modelling.dl import ( @@ -39,6 +40,20 @@ FocalLoss, ) from dagobert.modelling.augmentation.utils import get_noise, pca_analysis +from dagobert.modelling.utils import ( + triple_barrier_error, + non_vertical_error, + t2n, + cm_from_tensor, + hist_from_tensor, + plot_from_tensor, + plot_cm, + fig_to_tb, + fig_to_comet, + plot_pca, + update_lookback, + plot_anchor_sample, +) logger = logging.getLogger(__name__) @@ -430,7 +445,7 @@ def validation_step(self, batch, batch_idx): y_fake_e = self.discriminator(e_hat.detach()) y_real = self.discriminator(h.detach()) - pca_x, pca_x_hat = pca_analysis(x, x_hat) + pca_x, pca_x_hat = pca_analysis(t2n(x), t2n(x_hat)) loss_disc = TimeGANLightning.discriminator_loss( y_fake, y_fake_e, y_real, self.hparams.emb_weight @@ -483,6 +498,7 @@ def _get_dataloader(self, dfs_to_load: dict, prefix: str) -> DataLoader: last_y=self.hparams.last_y, data_dir=self.hparams.data_dir, ) + self._plot_dataset(*dataset.plot(), prefix) return DataLoader( dataset, batch_size=self.hparams.batch_size, @@ -619,105 +635,77 @@ def _epoch_end(self, outputs, prefix="val"): """ avg_loss = [] - y_true = [] + y_real = [] y_fake = [] y_fake_e = [] + pca_x = [] + pca_x_hat = [] for x in outputs: avg_loss.append(x[f"loss_disc/{prefix}"]) - y_true.append(x[f"y_true/{prefix}"]) + y_real.append(x[f"y_real/{prefix}"]) y_fake.append(x[f"y_fake/{prefix}"]) y_fake_e.append(x[f"y_fake_e/{prefix}"]) + pca_x.append(x[f"pca_x/{prefix}"]) + pca_x_hat.append(x[f"pca_x_hat/{prefix}"]) # log sampled images - self._make_plots(y_true, y_fake, prefix) - - def 
_calculate_metrics(self, y_true, y_pred, prefix): - """ - Calculates and logs various metrics for regression and classification use cases. - """ - if self.hparams.output_size == 1: - y_pred = y_pred.squeeze(-1) - y_true = y_true.squeeze(-1) - - if self.hparams.regression or self.hparams.mix_density_net: - mae = plm.mean_absolute_error(y_pred, y_true) - self.log(f"mean_absolute_error/{prefix}", mae) - spearman = spearmanr(t2n(y_pred), t2n(y_true)).correlation - self.log(f"spearman_r/{prefix}", spearman) - else: - if self.hparams.output_size == 1: - self.log(f"au_roc/{prefix}", plm.auroc(y_pred, y_true)) - prec, rec, _ = plm.precision_recall_curve(y_pred, y_true) - self.log(f"au_pr/{prefix}", plm.auc(rec, prec)) - elif self.hparams.output_size == 3: - y_pred = torch.argmax(torch.softmax(y_pred, dim=1), dim=1) - cm = cm_from_tensor(y_true, y_pred) - self.log(f"triple_barrier_error/{prefix}", triple_barrier_error(cm)) - self.log(f"non_vertical_error/{prefix}", non_vertical_error(cm)) + self._make_plots(y_real, y_fake, pca_x, pca_x_hat, prefix) # ---------------------------------------------------------------------------------- # PLOTTING AND LOGGING FUNCTIONS # ---------------------------------------------------------------------------------- - def _make_plots(self, y_true, y_pred, prefix): + def _plot_dataset( + self, fig_close: Figure, fig_data: Figure, fig_target: Figure, prefix: str + ): """ - Makes the following useful summary plots of true and predicted ys: - - scatter plot of y_true and y_pred - - histogram of y_true, y_pred - - AUPR, AUROC - - confusion matrices for classification + Plots the close price and the target column of the train/val/test datasets. + + Args: + fig_close: First element of the returned tuple of `CryptoDataset.plot()` + fig_data: Second element of the returned tuple of `CryptoDataset.plot()` + prefix: One of train, val, test. 
""" - # SCATTER - if self.hparams.regression or self.hparams.mix_density_net: - self._log_image( - f"true v pred scatter/{prefix}", - plot_from_tensor(y_true, y_pred), - self.current_epoch, - ) - else: - # HISTOGRAM - if self.hparams.output_size == 1: - y_pred_class = (torch.sigmoid(y_pred) > 0.5).int() - y_pred_for_hist = torch.sigmoid(y_pred) - else: - y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1) - y_pred_for_hist = y_pred_class - self._log_image( - f"true v pred hist/{prefix}", - hist_from_tensor(y_true, y_pred_for_hist), - self.current_epoch, - ) + self._log_image(f"anchor_close/{prefix}", fig_close, 0) - # PR, ROC - if self.hparams.output_size == 1: - y_pred = y_pred.squeeze(-1) - y_true = y_true.squeeze(-1) - fpr, tr, _ = plm.roc(y_pred, y_true) - self._log_image( - f"roc/{prefix}", - plot_from_tensor(fpr, tr, "line", "FPR", "TPR"), - self.current_epoch, - ) - prec, rec, thr = plm.precision_recall_curve(y_pred, y_true) - self._log_image( - f"pr/{prefix}", - plot_from_tensor(rec, prec, "line", "recall", "precision"), - self.current_epoch, + def _log_image(self, image_name, image_data, i): + """ + Logs any generated image to both tensorboard and comet. + """ + if self.real_logging: + self.logger.experiment[0].add_image(image_name, fig_to_tb(image_data), i) + if self.comet_logging: + self.logger.experiment[1].log_image( + fig_to_comet(image_data), name=image_name, step=i ) - # CM - high confidence binary classification - proba_filter = self.hparams.confident_binary_proba_threshold - if y_pred.max() >= proba_filter or y_pred.min() <= (1.0 - proba_filter): - self._log_image( - f"high confidence true v pred cm/{prefix}", - plot_cm(cm_from_tensor(y_true, y_pred, proba_filter)), - self.current_epoch, - ) + def _log_graph(self, datasets: GeneratorCryptoDataset): + """ + Logs the graph of the model to both tensorboard and comet. 
+ """ + examples_dataloader = DataLoader(datasets, batch_size=32) + example_shapes = [xi.shape for xi in next(iter(examples_dataloader))[0]] + examples = [torch.rand(*s).float().to(self.tgan_device) for s in example_shapes] + if self.real_logging: + self.logger.experiment[0].add_graph(self, examples) - # CMs - all classification - self._log_image( - f"true v pred cm/{prefix}", - plot_cm(cm_from_tensor(y_true, y_pred_class)), - self.current_epoch, - ) + def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, prefix): + """ + Makes following useful summary plots: + - plotting 2-dim PCA for visualising diversity learned + - (discriminator's) histogram of y_true, y_fake, y_fake_e + """ + # PCA SCATTER + self._log_image( + f"real v fake PCA-scatter/{prefix}", + plot_pca(pca_x, pca_x_hat), + self.current_epoch, + ) + + # HISTOGRAM + self._log_image( + f"real v fake hist/{prefix}", + hist_from_tensor(y_real, y_fake), + self.current_epoch, + ) # ---------------------------------------------------------------------------------- # SANITY CHECK FUNCTIONS diff --git a/src/dagobert/modelling/augmentation/utils.py b/src/dagobert/modelling/augmentation/utils.py index 9137d97d..11351f66 100644 --- a/src/dagobert/modelling/augmentation/utils.py +++ b/src/dagobert/modelling/augmentation/utils.py @@ -34,35 +34,11 @@ def pca_analysis(x, x_hat, components: int = 2): Returns: 2 arrays of PCA-reduced real and synthetic data """ - x = np.asarray(x) - x_hat = np.asarray(x_hat) - x = np.mean(x, 2) x_hat = np.mean(x_hat, 2) - # Parameters - No = x.shape[0] - colors = ["red" for i in range(No)] + ["blue" for i in range(No)] - - # PCA Analysis pca = PCA(n_components=components) pca.fit(x) pca_results = pca.transform(x) pca_hat_results = pca.transform(x_hat) return pca_results, pca_hat_results - # Plotting - """ - f, ax = plt.subplots(1) - - plt.scatter(pca_results[:, 0], pca_results[:, 1], c=colors[:No], alpha=0.2, - label="Original") - plt.scatter(pca_hat_results[:, 0], pca_hat_results[:, 1], 
c=colors[No:], - alpha=0.2, label="Synthetic") - - ax.legend() - - plt.title('PCA plot') - plt.xlabel('x-pca') - plt.ylabel('y_pca') - plt.show() - """ diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 779b44e6..5de98d3e 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -640,8 +640,8 @@ class GeneratorCryptoDataset(CryptoDataset): CryptoDataset as possible, without extensive refactoring. """ - def __init__(self, *args, **kw): - super().__init__(*args, **kw) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __getitem__(self, idx): """ diff --git a/src/dagobert/modelling/utils.py b/src/dagobert/modelling/utils.py index e23befd4..90bf465c 100644 --- a/src/dagobert/modelling/utils.py +++ b/src/dagobert/modelling/utils.py @@ -421,3 +421,33 @@ def plot_anchor_sample(i, obj, x): cols = obj.hparams.cols_to_model["anchor"] df = pd.DataFrame(x[0][i].detach().cpu().numpy().T, columns=cols) df.plot(subplots=True, layout=(int(np.ceil((len(cols) / 4))), 4)) + + +def plot_pca(pca_x, pca_x_hat): + """ + Plot PCA-reduced x and x_hat to visualise similarity. Overlap suggests similarity. + Args: + pca_x: 2-component-PCA of x + pca_x_hat: 2-component-PCA of x_hat + + Returns: + Scatter plot showing 2-component-PCA of x & x_hat. 
+ """ + + f, ax = plt.subplots(1) + length = pca_x.shape[0] + colors = ["red" for i in range(length)] + ["blue" for i in range(length)] + plt.scatter(pca_x[:, 0], pca_x[:, 1], c=colors[:length], alpha=0.2, label="Real") + plt.scatter( + pca_x_hat[:, 0], + pca_x_hat[:, 1], + c=colors[length:], + alpha=0.2, + label="Synthetic", + ) + ax.legend() + plt.title("PCA plot") + plt.xlabel("x-pca") + plt.ylabel("y_pca") + plt.close() + return f From d48c2fb18647e5e0dcdce32dfea5da3a988514cb Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sat, 30 Jan 2021 17:49:45 +0000 Subject: [PATCH 43/62] adding multi head environment to speed up experience gathering without multiprocessing --- config/rl_config.yaml | 16 +- src/dagobert/modelling/dl/tcn_args.py | 13 -- src/dagobert/modelling/rl/environment.py | 106 +++++----- src/dagobert/modelling/rl/ppo.py | 242 ++++++++++++++--------- src/dagobert/modelling/rl/rl_args.py | 48 ++++- 5 files changed, 265 insertions(+), 160 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 55ed4acf..8f1bd392 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -23,7 +23,7 @@ tags: - RL_test no_comet_logger: True seed: 42 -batch_size: 256 +batch_size: 500 # -------------------------------------------------------------------------------------- # RL @@ -36,12 +36,15 @@ asset_names: - LTC trading_cost: 0.002 reward_type: return -num_env_heads: 4 -max_episode_length: 2048 -steps_per_epoch: 24576 +num_env_heads: 20 +num_env_workers: 1 +normalize_advantages: True +pgportfolio: True +max_episode_length: 1000 +steps_per_epoch: 100000 n_optim_iters: 6 -gamma: 0 -lam: 0 +gamma: 0.99 +lam: 0.95 lr_actor: 0.0001 lr_critic: 0.0003 clip_ratio: 0.25 @@ -53,7 +56,6 @@ target_col: rl_return to_label: False no_sample_weights: True binariser_method: -no_weight_norm: True # -------------------------------------------------------------------------------------- # MODEL diff --git a/src/dagobert/modelling/dl/tcn_args.py 
b/src/dagobert/modelling/dl/tcn_args.py index 21df5792..d08b0a0b 100644 --- a/src/dagobert/modelling/dl/tcn_args.py +++ b/src/dagobert/modelling/dl/tcn_args.py @@ -188,19 +188,6 @@ def add_model_specific_args(parent_parser): "multi-class (3) classification with CrossEntropyLoss." ), ) - parser.add_argument( - "--no_weight_norm", - action="store_true", - help=( - " Weight norm is registered as a pre_forward_hook on the 1D convolutional " - "layers of the TemporalBlock, and these cannot be serialised when training " - "with parallel processes interacting with the model concurrently. If True, " - "we add weight normalisation around these layers, and TCN cannot be used " - "in a multiprocessing setting. If False, then it can be used, even staying " - "on GPU in linux (CPU only on Windows)." - ), - ) - parser.add_argument( "--no_class_weights", action="store_true", diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index e6a57db3..6218ee38 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -10,6 +10,7 @@ import torch import numpy as np from torch.utils.data import Dataset, DataLoader +from torch.utils.data.dataloader import default_collate from dagobert.naming import NPreprocessingArgs as npa from dagobert.modelling.dl import PortfolioCryptoDataset @@ -21,8 +22,10 @@ class RLData(object): """ - Leverages the data class and configuration methods from the `dagobert.modelling.dl` - module as much as possible. + Creates a multi-head data reader, meaning, we can concurrently return the next + state of the environment for arbitrarily many times (see step function). Leverages + the data class and configuration methods from the `dagobert.modelling.dl` module as + much as possible. """ def __init__( @@ -34,8 +37,8 @@ def __init__( Class constructor. Args: - hparams: Hyparams parsed by the rl_runner. 
Similar to how `TCNLightning` is - initialized with the following fields: + hparams: Hyperparams parsed by the rl_runner. Similar to how `TCNLightning` + is initialized with the following fields: - max_episode_length - cols_to_model - target_col @@ -50,7 +53,6 @@ def __init__( either train, val or test. """ self.hparams = hparams - if train_val_test == "train": augment_dfs = self.hparams.augment_dfs augment_method = self.hparams.augment_method @@ -69,13 +71,11 @@ def __init__( augment_dfs_mix=self.hparams.augment_dfs_mix, ) self.dataset_len = len(self.dataset) + self.latest_idx = self.dataset_len - self.hparams.max_episode_length self._reset_idxs() - def step(self): - from IPython import embed - - embed() - Xs, ys = self.dataset[self.idxs] + def step2(self): + Xs, ys = self.dataset[self.idxs[0]] # add cash price (always 1) to the new price vector y1 = np.concatenate([[1.0], ys]) # turn Xs into a batch of 1, ready to be fed into the actor/critic @@ -83,15 +83,30 @@ def step(self): self.idxs += 1 return Xs, y1 + def step(self): + Xs = [] + ys = [] + for idx in self.idxs: + X, y = self.dataset[idx] + # Xs.append([torch.Tensor(x).unsqueeze(0) for x in X]) + # making sure have float32 data so we don't get torch.float64 tensors later + Xs.append([x.astype("float32") for x in X]) + ys.append(y) + self.idxs += 1 + + # add cash price (always 1) to the new price vector (a column of ones) + ys = np.vstack(ys) + y1 = np.ones((ys.shape[0], ys.shape[1] + 1)) + y1[:, 1:] = ys + return default_collate(Xs), y1 + def reset(self): self._reset_idxs() return self.step() def _reset_idxs(self): - self.idxs = [ - np.random.randint(self.dataset_len - self.hparams.max_episode_length) - for _ in self.hparams.num_env_heads - ] + # reset all head's starting index + self.idxs = np.random.randint(self.latest_idx, size=self.hparams.num_env_heads) class RLPortfolio(object): @@ -116,10 +131,12 @@ def __init__(self, hparams: Namespace): hparams: Hyparams parsed by the rl_runner. 
Similar to how `TCNLightning` is initialized with the following fields: - asset_names + - num_env_heads - trading_cost - reward_type """ self.asset_names = hparams.asset_names + self.num_env_heads = hparams.num_env_heads self.asset_n = len(self.asset_names) self.trading_cost = hparams.trading_cost self.reward_type = hparams.reward_type @@ -140,19 +157,17 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: """ w0 = self.w0 p0 = self.p0 - assert y1[0] == 1, "Cash price has to remain constant: 1." # (eq7) since we last acted prices changed, so weights evolve into - dw1 = (y1 * w0) / (np.dot(y1, w0) + eps) + new_price_old_weights_sum = np.sum(y1 * w0, axis=1) + dw1 = ((y1 * w0).T / (new_price_old_weights_sum + eps)).T # (eq16) cost to change portfolio: # excluding change in cash to avoid double counting for transaction cost - mu = self.trading_cost * (np.abs(dw1[1:] - w1[1:])).sum() + mu = self.trading_cost * (np.abs(dw1[:, 1:] - w1[:, 1:])).sum(axis=1) - # (eq11) final portfolio value: I thought this should be w1 (at the end), but - # then think through how the env actually models the world (see Figure 1), w0 - # (which is the original implementation) makes sense here. 
- p1 = p0 * (1 - mu) * np.dot(y1, w0) + # (eq11) final portfolio value + p1 = p0 * (1 - mu) * new_price_old_weights_sum # (eq9 & 10) rate of return log rate of return rho1 = p1 / p0 - 1 # rate of returns @@ -161,37 +176,38 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: # (eq22) immediate reward is log rate of return scaled by episode length if self.reward_type == "return": reward = r1 - # TODO: implement the differentiable sharpe ratio reward like so https://quant.stackexchange.com/a/38040 + # TODO: implement the differentiable sharpe ratio reward + # https://quant.stackexchange.com/a/38040 # remember for next step self.w0 = w1 self.p0 = p1 - # if we run out of money, we're done - done = p1 <= 0 - - # should only return single values, not list - info = { - "reward": reward, - "log_return": r1, - "portfolio_value": p1, - "market_return": y1.mean(), - "rate_of_return": rho1, - "weights_std": w1.std(), - "rebalancing_cost": mu, - } - # record weights and prices - for i, name in enumerate(["USD"] + self.asset_names): - info["weight_" + name] = w1[i] - info["price_" + name] = y1[i] - self.infos.append(info) - return reward, info, done + # if we run out of money we're done: all env heads are linked here unfortunately + done = np.any(p1 <= 0) + + infos = [] + for i in range(self.num_env_heads): + info = { + "reward": reward[i], + "log_return": r1[i], + "portfolio_value": p1[i], + "market_return": y1[i].mean(), + "rate_of_return": rho1[i], + "weights_std": w1[i].std(), + "rebalancing_cost": mu[i], + } + # record weights and prices + for j, name in enumerate(["USD"] + self.asset_names): + info["weight_" + name] = w1[i, j] + info["price_" + name] = y1[i, j] + infos.append(info) + return reward, infos, done def reset(self): - self.infos = [] - self.w0 = np.zeros(self.asset_n + 1) - self.w0[0] = 1 - self.p0 = 1.0 + self.w0 = np.zeros((self.num_env_heads, self.asset_n + 1)) + self.w0[:, 0] = 1 + self.p0 = np.ones(self.num_env_heads) class 
RLEnv(gym.Env): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index d27bc733..5d7155e0 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -75,7 +75,6 @@ def run_rl(args): # define trainer and and lightning module args.multiprocessing = True if args.gpus != 1 else False - args.num_workers = 1 if args.num_workers == 0 else args.num_workers args.windows = True if "win" in sys.platform else False trainer = Trainer.from_argparse_args( args, @@ -114,7 +113,7 @@ def __init__(self, hparams: Namespace): self.hparams = TCNLightning._check_mini_series_lookback(hparams) # create env, policy/value networks and experience buffer + tracking vars - self.envs = [RLEnv(self.hparams) for _ in range(self.hparams.num_workers)] + self.envs = [RLEnv(self.hparams) for _ in range(self.hparams.num_env_workers)] n_actions = self.envs[0].action_space.shape[0] self.critic = ActorCriticTCN( self.hparams, n_actions=n_actions, output_size=1, actor=False @@ -147,39 +146,60 @@ def generate_experience_buffer( List[torch.Tensor], ]: """ - Logic for generating trajectory data to train policy and value networks. This - is done leveraging the `Process` and `Queue` classes of the `multiprocessing` - module of Python. We'll launch hparams.num_workers number of new processes, - each replicating the environment in memory, so this can get expensive where - `num_assets` in the portfolio is large. + Logic for generating trajectory data to train policy and value networks. If + `num_env_workers` > 1, this is done leveraging the `Process` and `Queue` + classes of the `multiprocessing` module of Python. We'll launch + hparams.num_env_workers number of new processes, each replicating the + environment in memory, so this can get expensive where `num_assets` in the + portfolio is large. If `num_env_workers` = 1, we simply collect experience with + the main environment in the main process, running PPO. 
Yield: Tuple of Lists containing tensors for states, actions, log probs, qvals and advantage. """ - # TODO: make this optional and multi head env the default and set no_weight_norm - max_worker_steps = int(self.hparams.steps_per_epoch / self.hparams.num_workers) - parallel_experiences = ParallelExperiences() device = self.setup_model_for_experience_gathering() - for i in range(self.hparams.num_workers): - args = ( - self.envs[i], + # spawn multiple processes and gather experience in parallel + if self.hparams.num_env_workers > 1: + max_steps_per_worker = int( + self.hparams.steps_per_epoch + / self.hparams.num_env_workers + / self.self.hparams.num_env_heads + ) + parallel_experiences = ParallelExperiences() + for i in range(self.hparams.num_env_workers): + args = ( + self.envs[i], + self.agent, + device, + max_steps_per_worker, + self.hparams.max_episode_length, + len(self.hparams.asset_names), + self.hparams.num_env_heads, + self.hparams.gamma, + self.hparams.lam, + self.hparams.pgportfolio, + ) + parallel_experiences.create_worker(*args) + # collect experiences in parallel, then merge them, calculate metrics + self.buffer.merge_buffers(parallel_experiences.collect_experiences()) + else: + self.buffer = gather_experience( + self.envs[0], self.agent, device, - max_worker_steps, + int(self.hparams.steps_per_epoch / self.hparams.num_env_heads), self.hparams.max_episode_length, len(self.hparams.asset_names), + self.hparams.num_env_heads, self.hparams.gamma, self.hparams.lam, + self.hparams.pgportfolio, ) - parallel_experiences.create_worker(*args) - - # collect experiences in parallel, then merge them, calculate metrics - self.buffer.merge_buffers(parallel_experiences.collect_experiences()) self.update_metrics_to_log() + self.setup_model_for_training() # yield a dataset for dataloader for updating actor/critic and clear buffer - self.setup_model_for_training() for state, past_pw, action, old_logp, qval, adv in zip( self.buffer.states, self.buffer.past_pws, @@ -192,23 
+212,28 @@ def generate_experience_buffer( self.buffer.clear_buffer() def setup_model_for_experience_gathering(self): - """Helper function to move model to CPU if necessary""" + """Moves model to CPU if necessary for parallel experience gathering.""" # dropout and batch-norm doesn't make sense for experience gathering self.agent.critic_net.eval() self.agent.actor_net.eval() - # we cannot use cuda tensor sharing on windows (necessary for multiprocessing) if self.hparams.windows: - device = "cpu" - self.agent.critic_net.cpu() - self.agent.actor_net.cpu() + if self.hparams.num_env_workers > 1: + # we cannot use cuda tensor sharing on windows for multiprocessing + device = "cpu" + self.agent.critic_net.cpu() + self.agent.actor_net.cpu() + else: + device = self.device else: device = self.device - self.agent.critic_net.share_memory() - self.agent.actor_net.share_memory() + if self.hparams.num_env_workers > 1: + # use cuda tensor sharing on linux + self.agent.critic_net.share_memory() + self.agent.actor_net.share_memory() return device def setup_model_for_training(self): - """Helper function to move model back to GPU if necessary""" + """Moves model back to GPU if necessary after parallel experience gathering.""" if self.hparams.windows and self.hparams.gpus != 0: self.agent.critic_net.cuda() self.agent.actor_net.cuda() @@ -219,25 +244,27 @@ def update_metrics_to_log(self): """Helper function recalculating metrics we track at end of each epoch""" done_eps = self.buffer.done_episodes + eps ep_rewards = self.buffer.epoch_rewards - e = "episode" - p = "portfolio" # pytorch lightning model checkpoint needs metric name without / + e = "episode" self.to_log["avg_total_reward"] = ep_rewards / done_eps self.to_log[f"{e}/avg_total_reward"] = ep_rewards / done_eps self.to_log[f"{e}/avg_step_reward"] = ep_rewards / self.hparams.steps_per_epoch self.to_log[f"{e}/avg_len"] = self.hparams.steps_per_epoch / done_eps + + # need this otherwise the generator won't work multiple times + p = 
"portfolio" + infos = pd.DataFrame(list(self.buffer.infos)).mean() self.to_log[f"{p}/avg_value_ep_end"] = ( sum(list(self.buffer.p_ep_end_value)) / done_eps ) self.to_log[f"{p}/avg_market_return_ep_end"] = ( sum(list(self.buffer.p_ep_end_market_return)) / done_eps ) - # need this otherwise the generator won't work multiple times - infos = pd.DataFrame(list(self.buffer.infos)).mean() self.to_log[f"{p}/avg_value"] = infos["portfolio_value"] - self.to_log[f"{p}/avg_weight_std"] = infos["weights_std"] self.to_log[f"{p}/avg_rebalancing_cost"] = infos["rebalancing_cost"] + + self.to_log["weights/avg_weight_std"] = infos["weights_std"] for w in infos.index[infos.index.str.contains("weight_")]: self.to_log[f"weights/{w}"] = infos[w] @@ -292,27 +319,6 @@ def calc_advantage( adv = PPO.discount_rewards(delta, gamma * lam) return adv - @staticmethod - def _init_past_pw(asset_num, device) -> torch.Tensor: - """ - Init past portfolio value and weights to [1, 1, 0, ..., 0], since after the - portfolio is reset for each trajector p0=1, w0[0]=1 (USD relative price is - always 1). - """ - past_pw = torch.ones(asset_num + 2).to(device) - # past_pw[:2] = 1 - return past_pw.unsqueeze(0) - - @staticmethod - def _update_past_pw(p1: float, action: torch.Tensor, device) -> torch.Tensor: - """ - After each interaction, update the past weight / portfolio value vector as for - the next interaction the actor and critic networks take that in along with the - new state to form their outputs. 
- """ - p1 = torch.Tensor([p1]).to(device) - return torch.cat([p1.unsqueeze(0), action], -1) - # ---------------------------------------------------------------------------------- # LOSSES AND OPTIMIZERS # ---------------------------------------------------------------------------------- @@ -367,8 +373,10 @@ def training_step( loss """ state, past_pw, action, old_logp, qval, adv = batch + # normalize advantages within batch - # adv = (adv - adv.mean()) / adv.std() + if self.hparams.normalize_advantages: + adv = (adv - adv.mean()) / adv.std() # log all metrics (other than loss) for k, v in self.to_log.items(): @@ -388,9 +396,11 @@ def training_step( @staticmethod def _pre_sanity_check(hparams: Namespace): - # ensure we have the rl specific target column in the config if hparams.target_col != NRL.rl_return: raise ValueError("target_col has to be rl_return for RL tasks.") + if hparams.num_env_workers > 1 and not hparams.no_weight_norm: + hparams.no_weight_norm = True + logger.warning("We set no_weight_norm=True as you have num_env_workers>1.") # fill in the same cols for any df that doesn't have the cols_to_model defined if len(hparams.cols_to_model) > 1: @@ -455,6 +465,7 @@ def append( self.infos.append(info) self.ep_rewards.append(reward) self.ep_values.append(value.item()) + self.ep_market_returns.append(info["market_return"]) def shift_rewards(self): """ @@ -472,10 +483,12 @@ def shift_rewards(self): self.logps.pop(-1) self.infos.pop(-1) self.ep_values.pop(-1) + self.ep_market_returns.pop(-1) def merge_buffers(self, buffers): """ Merges the passed in ExperienceBuffers and overwrites the current state with it. + Used when experience is gathered by multiple workers in parallel. 
Args: buffers: List of smaller ExpereinceBuffers to merge together from parallel @@ -511,6 +524,7 @@ def clear_buffer(self): # episode / epoch vars self.ep_rewards = [] self.ep_values = [] + self.ep_market_returns = [] self.done_episodes = 0 self.epoch_rewards = 0 @@ -518,6 +532,9 @@ def clear_buffer(self): class ParallelExperiences: """ Parallelised experience gathering, idea from https://stackoverflow.com/a/45829852 + Used to spawn parallel processes for each `env_worker` which can independently can + interact with a copy of the environment and return its rewards, logps, values, etc + from the rollout. """ def __init__(self): @@ -559,15 +576,18 @@ def gather_experience( max_steps: int, max_episode_length: int, asset_num: int, + num_env_heads: int, gamma: float, lam: float, + pgportfolio: bool = True, ): """ Workhorse function of the parallel experience gathering. This function can be called as many times as many CPUs are available on the system, to collect the desired number of steps and store them into an `ExperienceBuffer` that is then passed back (via a `multiprocessing.Queue` object) to the main process that - spawned the parallel processes. + spawned the parallel processes. Crucially, this also works if we only have a + single worker i.e. the main process of PPO. Args: env: An instance of the environment to act on. @@ -576,53 +596,87 @@ def gather_experience( max_steps: Total number of steps (over multiple episodes) a worker can take. max_episode_length: Maximum length of a trajectory / episode. asset_num: Number of assets we are modelling (not including USD). + num_env_heads: Number of environment heads we use to interact with the env. gamma: See docs of :func:`PPO.calc_advantage` lam: See docs of :func:`PPO.calc_advantage` + pgportfolio: If True, we calculate the q-values and advantages according to + https://arxiv.org/pdf/1706.10059.pdf, else we use the traditional PPO algo. Returns: Experience collected in this parallel worker. 
""" - from datetime import datetime - - buffer = ExperienceBuffer() state = env.reset() - past_pw = PPO._init_past_pw(asset_num, device) - for step in range(max_steps): + buffers = [ExperienceBuffer() for _ in range(num_env_heads)] + past_pw = init_past_pw(num_env_heads, asset_num, device) + + for step in range(max_steps + 1): + episode_end = step > 0 and step % max_episode_length == 0 + # get action, make step, get reward and info from env pi, action, actor_logits, logp, value = agent(state, past_pw, device) next_state, reward, done, info = env.step(action.cpu().numpy()) - # store everything and update state, past_pw - buffer.append(state, past_pw, action, logp, reward, value, info) + # update past portfolio value / weights for next round + p1 = torch.Tensor([i["portfolio_value"] for i in info]).to(device).unsqueeze(0) + past_pw = torch.cat([p1.T, actor_logits], -1) + + # store everything, we need to do this for each environment head separately + for i, buffer in enumerate(buffers): + buffer.append( + [s[i] for s in state], + past_pw[i], + action[i], + logp[i], + reward[i], + value[i], + info[i], + ) state = next_state - past_pw = PPO._update_past_pw(info["portfolio_value"], actor_logits, device) - - terminal = len(buffer.ep_rewards) == max_episode_length - if done or terminal or step == max_steps - 1: - # buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma) - # buffer.advs += PPO.calc_advantage( - # buffer.ep_rewards, buffer.ep_values, gamma, lam - # ) - - # if rewards are immediate, we need this is due to our special environment - # where the immediate reward of a_0 can only calculate at t_1. 
- if gamma == 0: - buffer.shift_rewards() - # according to the PGPortfolio paper, reward should be the sum of portfolio - # values, divided by length of episode - no discounting no BS, same for adv - epr = buffer.ep_rewards - epr = np.ones_like(epr) * sum(epr) / len(epr) - buffer.qvals += list(epr) - buffer.advs += list(epr - np.array(buffer.ep_values)) - if done or terminal: - buffer.done_episodes += 1 - buffer.epoch_rewards += np.sum(buffer.ep_rewards) - buffer.p_ep_end_value.append(info["portfolio_value"]) - buffer.p_ep_end_market_return.append(np.array(info["market_return"]).prod()) - - # episode over, reset the env and the episode buffer - buffer.ep_rewards = [] - buffer.ep_values = [] - state = env.reset() - past_pw = PPO._init_past_pw(asset_num, device) - return buffer + + if done or episode_end or step == max_steps - 1: + for buffer in buffers: + # according to the PGPortfolio paper, reward should be the sum of + # immediate rewards (portfolio returns p1/p0) div by length of episode + if pgportfolio: + if gamma == 0: + buffer.shift_rewards() + epr = buffer.ep_rewards + epr = np.ones_like(epr) * sum(epr) / len(epr) + buffer.qvals += list(epr) + buffer.advs += list(epr - np.array(buffer.ep_values)) + # classic PPO qval and reward estimation + else: + buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma) + buffer.advs += PPO.calc_advantage( + buffer.ep_rewards, buffer.ep_values, gamma, lam + ) + if done or episode_end: + buffer.done_episodes += 1 + buffer.epoch_rewards += np.sum(buffer.ep_rewards) + buffer.p_ep_end_value.append(buffer.infos[-1]["portfolio_value"]) + buffer.p_ep_end_market_return.append( + np.array(buffer.ep_market_returns).prod() + ) + + # episode over, reset the env and the episode buffer + buffer.ep_rewards = [] + buffer.ep_values = [] + buffer.ep_market_returns = [] + state = env.reset() + past_pw = init_past_pw(num_env_heads, asset_num, device) + + # merge buffers from each environment head and return new buffer + merged_buffers = 
ExperienceBuffer() + merged_buffers.merge_buffers(buffers) + return merged_buffers + + +def init_past_pw( + num_env_heads: int, asset_num: int, device: torch.device +) -> torch.Tensor: + """Init past portfolio value and weights (Dirichlet concentrations) as all ones.""" + # init past portfolio value and weights + past_pw = torch.ones(num_env_heads, asset_num + 2).to(device) + if num_env_heads == 1: + past_pw = past_pw.unsqueeze(0) + return past_pw diff --git a/src/dagobert/modelling/rl/rl_args.py b/src/dagobert/modelling/rl/rl_args.py index 4f2db348..6fd9a604 100644 --- a/src/dagobert/modelling/rl/rl_args.py +++ b/src/dagobert/modelling/rl/rl_args.py @@ -54,6 +54,41 @@ def add_rl_specific_args(parent_parser): "sharpe. See RLPortfolio class for more details." ), ) + parser.add_argument( + "--num_env_heads", + type=int, + default=1, + help=( + "Number of heads we want to read the environment with concurrently. This " + "is an easy and cheap way to parallelize experience gathering on its own " + "does not require multiple processes to be spawn." + ), + ) + parser.add_argument( + "--num_env_workers", + type=int, + default=1, + help=( + "Number parallel processes to spawn to gather experience. This represents " + "second layer of concurrency (num_env_heads being the first and simplest). " + "If this is set to higher than 1, we will have to turn no_weight_norm=True." + ), + ) + + parser.add_argument( + "--pgportfolio", + action="store_true", + help=( + "If True, we calculate the q-values and advantages according to " + "https://arxiv.org/pdf/1706.10059.pdf, else we use traditional PPO algo." + ), + ) + + parser.add_argument( + "--normalize_advantages", + action="store_true", + help="If used, we normalize the advantages in each batch of the learning phase.", + ) parser.add_argument( "--max_episode_length", type=int, @@ -122,7 +157,6 @@ def add_rl_specific_args(parent_parser): "the model becomes deterministic." 
), ) - return parser @@ -167,6 +201,18 @@ def add_model_specific_args(parent_parser): "for the supervised DL module." ), ) + parser.add_argument( + "--no_weight_norm", + action="store_true", + help=( + " Weight norm is registered as a pre_forward_hook on the 1D convolutional " + "layers of the TemporalBlock, and these cannot be serialised when training " + "with parallel processes interacting with the model concurrently. If True, " + "we add weight normalisation around these layers, and TCN cannot be used " + "in a multiprocessing setting. If False, then it can be used, even staying " + "on GPU in linux (CPU only on Windows)." + ), + ) parser.add_argument( "--use_last_timepoint", action="store_true", From 629b4a91e2ff029bda020c34ecd1b0e2904e7edb Mon Sep 17 00:00:00 2001 From: Daniel Homola Date: Sun, 31 Jan 2021 12:45:40 +0000 Subject: [PATCH 44/62] num_env_heads works with 1 or many, num_env_workers works with 1 or many, created portfolio_vs_market reward, cleaned up pgportfolio qvals/adv calc --- config/rl_config.yaml | 15 ++++--- src/dagobert/modelling/rl/environment.py | 13 +++++- src/dagobert/modelling/rl/ppo.py | 55 +++++++++--------------- 3 files changed, 41 insertions(+), 42 deletions(-) diff --git a/config/rl_config.yaml b/config/rl_config.yaml index 8f1bd392..cf24c076 100644 --- a/config/rl_config.yaml +++ b/config/rl_config.yaml @@ -24,6 +24,7 @@ tags: no_comet_logger: True seed: 42 batch_size: 500 +max_epochs: 100 # -------------------------------------------------------------------------------------- # RL @@ -35,19 +36,19 @@ asset_names: - XRP - LTC trading_cost: 0.002 -reward_type: return +reward_type: portfolio_vs_market num_env_heads: 20 num_env_workers: 1 normalize_advantages: True -pgportfolio: True -max_episode_length: 1000 -steps_per_epoch: 100000 -n_optim_iters: 6 +pgportfolio: False +max_episode_length: 2000 +steps_per_epoch: 80000 +n_optim_iters: 4 gamma: 0.99 lam: 0.95 lr_actor: 0.0001 lr_critic: 0.0003 -clip_ratio: 0.25 +clip_ratio: 0.2 
target_kl: 0.01 @@ -65,7 +66,7 @@ actor_num_channels: [50, 50, 50, 50, 50] actor_kernel_size: 5 actor_dropout: 0.2 # sample size - exp abs diff to mean | 20 - 5% | 50 - 3% | 100 - 2% | 500 - 1% -actor_dirichlet_sample_size: 0 +actor_dirichlet_sample_size: 20 critic_num_channels: [50, 50, 50, 50, 50] critic_kernel_size: 5 critic_dropout: 0.2 diff --git a/src/dagobert/modelling/rl/environment.py b/src/dagobert/modelling/rl/environment.py index 6218ee38..7ce7a15c 100644 --- a/src/dagobert/modelling/rl/environment.py +++ b/src/dagobert/modelling/rl/environment.py @@ -157,6 +157,10 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: """ w0 = self.w0 p0 = self.p0 + m0 = self.m0 + + # market return for new timepoint for each head + m1 = m0 * y1.mean(axis=1) # (eq7) since we last acted prices changed, so weights evolve into new_price_old_weights_sum = np.sum(y1 * w0, axis=1) @@ -176,12 +180,15 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: # (eq22) immediate reward is log rate of return scaled by episode length if self.reward_type == "return": reward = r1 + elif self.reward_type == "portfolio_vs_market": + reward = np.log(p1 + eps) - np.log(m1 + eps) # TODO: implement the differentiable sharpe ratio reward # https://quant.stackexchange.com/a/38040 # remember for next step self.w0 = w1 self.p0 = p1 + self.m0 = m1 # if we run out of money we're done: all env heads are linked here unfortunately done = np.any(p1 <= 0) @@ -192,7 +199,7 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: "reward": reward[i], "log_return": r1[i], "portfolio_value": p1[i], - "market_return": y1[i].mean(), + "market_return": m1[i], "rate_of_return": rho1[i], "weights_std": w1[i].std(), "rebalancing_cost": mu[i], @@ -205,9 +212,13 @@ def step(self, w1: np.array, y1: np.array) -> Tuple[float, dict, bool]: return reward, infos, done def reset(self): + # weights for each head self.w0 = np.zeros((self.num_env_heads, self.asset_n + 
1)) self.w0[:, 0] = 1 + # portfolio value for each head self.p0 = np.ones(self.num_env_heads) + # market return for each head + self.m0 = np.ones(self.num_env_heads) class RLEnv(gym.Env): diff --git a/src/dagobert/modelling/rl/ppo.py b/src/dagobert/modelling/rl/ppo.py index 5d7155e0..0200ff0b 100644 --- a/src/dagobert/modelling/rl/ppo.py +++ b/src/dagobert/modelling/rl/ppo.py @@ -164,7 +164,7 @@ def generate_experience_buffer( max_steps_per_worker = int( self.hparams.steps_per_epoch / self.hparams.num_env_workers - / self.self.hparams.num_env_heads + / self.hparams.num_env_heads ) parallel_experiences = ParallelExperiences() for i in range(self.hparams.num_env_workers): @@ -252,18 +252,15 @@ def update_metrics_to_log(self): self.to_log[f"{e}/avg_step_reward"] = ep_rewards / self.hparams.steps_per_epoch self.to_log[f"{e}/avg_len"] = self.hparams.steps_per_epoch / done_eps - # need this otherwise the generator won't work multiple times p = "portfolio" infos = pd.DataFrame(list(self.buffer.infos)).mean() - self.to_log[f"{p}/avg_value_ep_end"] = ( - sum(list(self.buffer.p_ep_end_value)) / done_eps - ) - self.to_log[f"{p}/avg_market_return_ep_end"] = ( - sum(list(self.buffer.p_ep_end_market_return)) / done_eps - ) + p_val = np.array(list(self.buffer.p_ep_end_value)) + m_ret = np.array(list(self.buffer.p_ep_end_market_return)) + self.to_log[f"{p}/avg_value_ep_end"] = p_val.mean() + self.to_log[f"{p}/avg_market_return_ep_end"] = m_ret.mean() + self.to_log[f"{p}/avg_portfolio_vs_market"] = (p_val - m_ret).mean() self.to_log[f"{p}/avg_value"] = infos["portfolio_value"] self.to_log[f"{p}/avg_rebalancing_cost"] = infos["rebalancing_cost"] - self.to_log["weights/avg_weight_std"] = infos["weights_std"] for w in infos.index[infos.index.str.contains("weight_")]: self.to_log[f"weights/{w}"] = infos[w] @@ -464,7 +461,7 @@ def append( self.logps.append(logp) self.infos.append(info) self.ep_rewards.append(reward) - self.ep_values.append(value.item()) + self.ep_values.append(value) 
self.ep_market_returns.append(info["market_return"]) def shift_rewards(self): @@ -607,13 +604,14 @@ def gather_experience( """ state = env.reset() buffers = [ExperienceBuffer() for _ in range(num_env_heads)] - past_pw = init_past_pw(num_env_heads, asset_num, device) - - for step in range(max_steps + 1): - episode_end = step > 0 and step % max_episode_length == 0 + past_pw = torch.ones(num_env_heads, asset_num + 2).to(device) + for step in range(1, max_steps + 1): # get action, make step, get reward and info from env pi, action, actor_logits, logp, value = agent(state, past_pw, device) + if num_env_heads == 1: + action = action.unsqueeze(0) + logp = logp.unsqueeze(0) next_state, reward, done, info = env.step(action.cpu().numpy()) # update past portfolio value / weights for next round @@ -628,22 +626,22 @@ def gather_experience( action[i], logp[i], reward[i], - value[i], + value[i].item(), info[i], ) state = next_state + episode_end = step > 0 and step % max_episode_length == 0 if done or episode_end or step == max_steps - 1: for buffer in buffers: # according to the PGPortfolio paper, reward should be the sum of # immediate rewards (portfolio returns p1/p0) div by length of episode if pgportfolio: - if gamma == 0: - buffer.shift_rewards() - epr = buffer.ep_rewards - epr = np.ones_like(epr) * sum(epr) / len(epr) - buffer.qvals += list(epr) - buffer.advs += list(epr - np.array(buffer.ep_values)) + buffer.shift_rewards() + buffer.qvals += buffer.ep_rewards + buffer.advs += list( + np.array(buffer.ep_rewards) - np.array(buffer.ep_values) + ) # classic PPO qval and reward estimation else: buffer.qvals += PPO.discount_rewards(buffer.ep_rewards, gamma) @@ -652,7 +650,7 @@ def gather_experience( ) if done or episode_end: buffer.done_episodes += 1 - buffer.epoch_rewards += np.sum(buffer.ep_rewards) + buffer.epoch_rewards += sum(buffer.ep_rewards) buffer.p_ep_end_value.append(buffer.infos[-1]["portfolio_value"]) buffer.p_ep_end_market_return.append( 
np.array(buffer.ep_market_returns).prod() @@ -663,20 +661,9 @@ def gather_experience( buffer.ep_values = [] buffer.ep_market_returns = [] state = env.reset() - past_pw = init_past_pw(num_env_heads, asset_num, device) + past_pw = torch.ones(num_env_heads, asset_num + 2).to(device) # merge buffers from each environment head and return new buffer merged_buffers = ExperienceBuffer() merged_buffers.merge_buffers(buffers) return merged_buffers - - -def init_past_pw( - num_env_heads: int, asset_num: int, device: torch.device -) -> torch.Tensor: - """Init past portfolio value and weights (Dirichlet concentrations) as all ones.""" - # init past portfolio value and weights - past_pw = torch.ones(num_env_heads, asset_num + 2).to(device) - if num_env_heads == 1: - past_pw = past_pw.unsqueeze(0) - return past_pw From 3d66510356f4808caf2d0d98c8b6a25622964159 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Wed, 3 Feb 2021 13:45:34 +0000 Subject: [PATCH 45/62] args and stuff --- config/timegan_config.yaml | 27 ++- setup.cfg | 1 + .../modelling/augmentation/tgan_args.py | 158 +++--------------- .../modelling/augmentation/timegan.py | 11 +- src/dagobert/modelling/dl/tcn_args.py | 8 +- src/dagobert/naming.py | 9 + 6 files changed, 71 insertions(+), 143 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index cf1b838c..ad09b3cb 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -25,7 +25,7 @@ batch_size: 256 # -------------------------------------------------------------------------------------- # gru or lstm -rnn: lstm +rnn: gru # embedding weight in cost of generator loss emb_weight: 1 @@ -41,9 +41,16 @@ binariser_method: dropout: 0.2 num_layers: 2 -hidden_size: 50 -z_dim: 50 -mini_series_length: 50 +hidden_size: 10 +z_dim: 12 +mini_series_length: 20 +# don't change order with lr dict +lr: + embedder0: 0.001 + supervisor: 0.001 + generator: 0.001 + embedder1: 0.001 + discriminator: 0.001 # 
-------------------------------------------------------------------------------------- # DATA @@ -80,9 +87,21 @@ cols_to_model: # - sin_time # - cos_time +augment_method: +augment_dfs: +augment_dfs_mix: 0 # -------------------------------------------------------------------------------------- # PREPROCESSING # -------------------------------------------------------------------------------------- +train_start_date: "2019-01-01" +train_days: 1 +val_days: 1 +val_train_offset_days: 1 +val_puffer_days: 1 +test_days: 1 +test_train_offset_days: 62 +test_puffer_days: 1 + scaling_method: minmax \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index f0d2b5ce..5c702758 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,6 +58,7 @@ console_scripts = dagobert-optuna = dagobert.modelling.dl.optuna:run dagobert-s3 = dagobert.io.runner:run dagobert-rl = dagobert.modelling.rl.rl_runner:run + dagobert-tgan = dagobert.modelling.augmentation.tgan_runner:run [test] # py.test options when running `python setup.py test` diff --git a/src/dagobert/modelling/augmentation/tgan_args.py b/src/dagobert/modelling/augmentation/tgan_args.py index 32853ad6..ef0c4a6e 100644 --- a/src/dagobert/modelling/augmentation/tgan_args.py +++ b/src/dagobert/modelling/augmentation/tgan_args.py @@ -1,26 +1,21 @@ """ -All custom arguments and hyper-parameters for the reinforcement learning module. +All custom arguments and hyper-parameters for the TimeGAN module. 
""" from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from pytorch_lightning import Trainer -from dagobert.modelling.dl.tcn import TCNLightning from dagobert.modelling.dl.tcn_args import ( add_run_specific_args, + add_model_specific_args, add_data_specific_args, add_preprocessing_specific_args, ) -from dagobert.naming import ( - NInputDataCols, - NAugmentationMethods, - NBarriers, - NPreprocessingArgs, -) +from dagobert.naming import NGAN -def add_rl_specific_args(parent_parser): +def add_tgan_specific_args(parent_parser): parser = ArgumentParser( parents=[parent_parser], add_help=False, @@ -28,149 +23,48 @@ def add_rl_specific_args(parent_parser): ) # this is just a place-holder so it's easier to read the million params in the cmd - parser.add_argument("--RL_PARAMS", help="====================================") - parser.add_argument( - "--asset_names", - type=str, - nargs="+", - default=["BTC", "ETH"], - help=( - "Names of instruments to include in the portfolio, corresponding to " - "anchor, df2, df3, etc." - ), - ) - parser.add_argument( - "--trading_cost", - type=float, - default=0.002, - help="Commission rate of making trades + an estimated cost of slippage.", - ) + parser.add_argument("--TGAN_PARAMS", help="====================================") parser.add_argument( - "--reward_type", - type=str, - default="return", - help=( - "Determines the overall reward to maximise by the agent. Either return or " - "sharpe. See RLPortfolio class for more details." - ), - ) - parser.add_argument( - "--max_episode_length", + "--z_dim", type=int, - default=1000, - help=( - "Maximum number of interactions between the agent and the environment in " - "an episode." - ), + default=50, + help="number of dimensions of noise vector (input of generator) at t timepoint", ) parser.add_argument( - "--steps_per_epoch", + "--hidden_size", type=int, - default=10000, - help=( - "How many action-state pairs to rollout for trajectory collection per " - "epoch. I.e. 
if all episodes run to their max_episode_length, we'll have " - "steps_per_epoch/max_episode_length number of unique episodes/trajectories." - ), + default=50, + help="The number of features in the hidden state, ie in embedded state.", ) parser.add_argument( - "--n_optim_iters", + "--num_layers", type=int, - default=4, + default=1, help=( - "How many steps of gradient descent to perform on each batch. This might " - "seem weird, but it helps sampling efficiency, done by the original PPO " - "implementation and the Google ablation study found it to be useful." + "Number of RNN layers stacked onto each other, ie with new one using output" + " of previous" ), ) parser.add_argument( - "--gamma", type=float, default=0.99, help="Discounting of rewards." - ) - parser.add_argument( - "--lam", - type=float, - default=0.95, - help="Lambda parameter in the advantage discounting equation.", - ) - parser.add_argument( - "--lr_actor", - type=float, - default=0.0003, - help="Learning rate for the actor/policy network.", - ) - parser.add_argument( - "--lr_critic", - type=float, - default=0.001, - help="Learning rate for the critic/value network.", - ) - parser.add_argument( - "--clip_ratio", - type=float, - default=0.2, - help="Clipping parameter for the PPO's policy upgrade cost function.", - ) - - return parser - - -def add_model_specific_args(parent_parser): - parser = ArgumentParser( - parents=[parent_parser], - add_help=False, - formatter_class=ArgumentDefaultsHelpFormatter, - ) - - # this is just a place-holder so it's easier to read the million params in the cmd - parser.add_argument("--MODEL_PARAMS", help="====================================") - parser.add_argument( - "--actor_num_channels", - type=int, - nargs="+", - default=[50, 50, 50, 50, 50], - help=( - "Determines the number of layers (depth) of the actor / policy network and " - "the hidden unit count in each layer." 
- ), + "--rnn", + type=str, + default=NGAN.lstm, + choices=[NGAN.lstm, NGAN.gru], + help="Choice of RNN to use, either LSTM or GRU", ) parser.add_argument( - "--critic_num_channels", + "--emb_weight", type=int, - nargs="+", - default=[50, 50, 50, 50, 50], - help=( - "Determines the number of layers (depth) of the critic / value network and " - "the hidden unit count in each layer." - ), - ) - parser.add_argument("--actor_kernel_size", type=int, default=5, help=" ") - parser.add_argument("--critic_kernel_size", type=int, default=5, help=" ") - parser.add_argument("--actor_dropout", type=float, default=0, help=" ") - parser.add_argument("--critic_dropout", type=float, default=0, help=" ") - parser.add_argument( - "--no_class_weights", - action="store_true", - help=( - "Set this to True so we can leverage the Preprocessing pipeline written " - "for the supervised DL module." - ), - ) - parser.add_argument( - "--use_last_timepoint", - action="store_true", - help=( - "If this flag is used the only the network's representation " - "corresponding at the latest time-point is used to predict the outcome." - "By default, we combine all representations across the sequence length" - "to make a prediction from, instead of just using the last one." 
- ), + default=1, + help="Weight multiplier for embedding component in generator loss", ) + return parser def get_all_args(): parser = ArgumentParser( - description="Lightning RL module", + description="Lightning TimeGAN module", formatter_class=ArgumentDefaultsHelpFormatter, ) @@ -178,7 +72,7 @@ def get_all_args(): parser = Trainer.add_argparse_args(parser) # add model and run specific params - parser = add_rl_specific_args(parser) + parser = add_tgan_specific_args(parser) parser = add_model_specific_args(parser) parser = add_run_specific_args(parser) parser = add_data_specific_args(parser) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index f26acde9..a46861eb 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -147,7 +147,7 @@ def forward(self, x): rnn_out, _hidden = self.rnn(x) rnn_out = self.tanh(rnn_out) # todo: is there reshaping needed? - # rnn_out = rnn_out.reshape(-1, self.linear_input_size) + rnn_out = rnn_out.reshape(-1, self.linear_input_size) output = self.linear(rnn_out) if self.linear_activation: output = self.sigmoid(output) @@ -406,13 +406,14 @@ def configure_optimizers(self) -> List[optim.Optimizer]: list(self.embedder.parameters()) + list(self.recovery.parameters()), list(self.discriminator.parameters()), ] + # TODO: diff lr for each net if "adam" in self.hparams.optimizer.lower(): - for param_pair in param_pairs: - optimizer = torch.optim.AdamW(param_pair, lr=self.hparams.lr) + for param_pair, network in zip(param_pairs, self.hparams.lr.keys()): + optimizer = torch.optim.AdamW(param_pair, lr=self.hparams.lr[network]) optimizers.append(optimizer) elif "adabelief" in self.hparams.optimizer.lower(): - for param_pair in param_pairs: - optimizer = AdaBelief(param_pair, lr=self.hparams.lr) + for param_pair, network in zip(param_pairs, self.hparams.lr.keys()): + optimizer = AdaBelief(param_pair, lr=self.hparams.lr[network]) 
optimizers.append(optimizer) return optimizers diff --git a/src/dagobert/modelling/dl/tcn_args.py b/src/dagobert/modelling/dl/tcn_args.py index 21df5792..1a14e4f2 100644 --- a/src/dagobert/modelling/dl/tcn_args.py +++ b/src/dagobert/modelling/dl/tcn_args.py @@ -1,6 +1,7 @@ """ All custom arguments and hyper-parameters for the TCN Lightning module. """ +from typing import Union from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter @@ -79,9 +80,12 @@ def add_run_specific_args(parent_parser): ) parser.add_argument( "--lr", - type=float, + type=Union[float, dict], default=0.003, - help="Learning rate. If set to 'auto' we'll find it automatically.", + help=( + "Learning rate. If set to 'auto' we'll find it automatically. In TimeGAN" + "different learning rates can be used for the various networks" + ), ) parser.add_argument( "--max_lr", diff --git a/src/dagobert/naming.py b/src/dagobert/naming.py index 829c0eca..fd2da5a8 100644 --- a/src/dagobert/naming.py +++ b/src/dagobert/naming.py @@ -284,3 +284,12 @@ class NRL(object): """ rl_return = "rl_return" + + +class NGAN(object): + """ + Naming object for TimeGAN. 
+ """ + + gru = "gru" + lstm = "lstm" From cd94349ed6bb91358bd2ae80f146537472586cd8 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Tue, 9 Feb 2021 17:24:55 +0000 Subject: [PATCH 46/62] debug --- config/custom/tcn_config_m.yaml | 4 +- config/timegan_config.yaml | 21 +- notebooks/modelling/test_cryptodataset.ipynb | 221 +++++++++++++++++- .../modelling/augmentation/timegan.py | 23 +- src/dagobert/modelling/dl/data.py | 2 - 5 files changed, 247 insertions(+), 24 deletions(-) diff --git a/config/custom/tcn_config_m.yaml b/config/custom/tcn_config_m.yaml index 1dba8fa5..de51bdb2 100644 --- a/config/custom/tcn_config_m.yaml +++ b/config/custom/tcn_config_m.yaml @@ -15,7 +15,7 @@ auto_scale_batch_size: # -------------------------------------------------------------------------------------- log_dir: logs -num_workers: 4 +num_workers: 1 exp_name: TCN tags: - model1 @@ -146,7 +146,7 @@ simple_augment_prob: 0.5 # -------------------------------------------------------------------------------------- train_start_date: "2018-06-01" -train_days: 30 +train_days: 1 val_days: 1 val_train_offset_days: 1 val_puffer_days: 1 diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index ad09b3cb..33cb093a 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -11,7 +11,7 @@ gpus: 0 # -------------------------------------------------------------------------------------- log_dir: logs -num_workers: 4 +num_workers: 0 exp_name: TGAN-test tags: - time_gan_test @@ -39,8 +39,9 @@ binariser_method: # MODEL # -------------------------------------------------------------------------------------- +optimizer: "adamw" dropout: 0.2 -num_layers: 2 +num_layers: 1 hidden_size: 10 z_dim: 12 mini_series_length: 20 @@ -74,14 +75,14 @@ cols_to_model: - high - low - close -# - cum_ticks -# - cum_dollar -# - volume -# - cum_volume_buy -# - cum_volume_sell -# - cum_volume_quote -# - cum_volume_quote_buy -# - cum_volume_quote_sell + - cum_ticks + - cum_dollar + - volume + - 
cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell # - sin_date # - cos_date # - sin_time diff --git a/notebooks/modelling/test_cryptodataset.ipynb b/notebooks/modelling/test_cryptodataset.ipynb index 3cf26b90..a4d4d903 100644 --- a/notebooks/modelling/test_cryptodataset.ipynb +++ b/notebooks/modelling/test_cryptodataset.ipynb @@ -1716,12 +1716,229 @@ "data_loaded[0].shape " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Misc" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import torchtext\n", + "import torch\n", + "from torchtext.data.utils import get_tokenizer\n", + "from collections import Counter\n", + "from torchtext.vocab import Vocab\n", + "from torchtext.utils import download_from_url, extract_archive\n", + "import io" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: en_core_web_sm==2.3.1 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm==2.3.1 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (2.3.1)\n", + "Requirement already satisfied: spacy<2.4.0,>=2.3.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from en_core_web_sm==2.3.1) (2.3.2)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (3.0.2)\n", + "Requirement already satisfied: setuptools in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from 
spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (45.2.0.post20200210)\n", + "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.18.1)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (0.4.1)\n", + "Requirement already satisfied: thinc==7.4.1 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (7.4.1)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (0.8.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.1.3)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.23.0)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.2)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.0.3)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.2)\n", + "Requirement already 
satisfied: tqdm<5.0.0,>=4.38.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (4.45.0)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2019.11.28)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.9)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.25.8)\n", + "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.7.0)\n", + "Requirement already satisfied: zipp>=0.5 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.2.0)\n", + "[+] Download and installation successful\n", + "You can now load the model via 
spacy.load('en_core_web_sm')\n", + "[x] Couldn't link model to 'en'\n", + "Creating a symlink in spacy/data failed. Make sure you have the required\n", + "permissions and try re-running the command as admin, or use a virtualenv. You\n", + "can still import the model as a module and call its load() method, or create the\n", + "symlink manually.\n", + "C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages\\en_core_web_sm\n", + "-->\n", + "C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages\\spacy\\data\\en\n", + "[!] Download successful but linking failed\n", + "Creating a shortcut link for 'en' didn't work (maybe you don't have admin\n", + "permissions?), but you can still load the model via its full package name: nlp =\n", + "spacy.load('en_core_web_sm')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You do not have sufficient privilege to perform this operation.\n" + ] + } + ], + "source": [ + "! 
python -m spacy download en" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = [\n", + " 'hello, my name is david', \n", + " 'david likes to swim', \n", + " 'the only thing david needs is attention'\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Argument 'string' has incorrect type (expected str, got list)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# We tokenize our input variables into numbers based on a loaded vocab\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mtokenized_inputs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0men_tokenizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# b x max_input_length\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages\\torchtext\\data\\utils.py\u001b[0m in \u001b[0;36m_spacy_tokenize\u001b[1;34m(x, spacy)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_spacy_tokenize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mspacy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[1;33m[\u001b[0m\u001b[0mtok\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtok\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mspacy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mTypeError\u001b[0m: Argument 'string' has incorrect type (expected str, got list)" + ] + } + ], + "source": [ + "# We tokenize our input variables into numbers based on a loaded vocab\n", + "tokenized_inputs = en_tokenizer(inputs) # b x max_input_length" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def build_vocab(input_list, tokenizer):\n", + " counter = Counter()\n", + " for string_ in input_list:\n", + " counter.update(tokenizer(string_))\n", + " return Vocab(counter, specials=['', '', '', ''])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "en_vocab = build_vocab(inputs, en_tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(, 18)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_vocab, len(en_vocab)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# We define our layers\n", + "hidden_size = 300\n", + "embedding = torch.nn.Embedding(len(en_vocab), hidden_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# RNN encoded size must be half since bidirectional RNNs\n", + "# produce 2 hidden states: forwards and backwards\n", + "encoded_size = 
int(hidden_size / 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "rnn = torch.nn.GRU(hidden_size, encoded_size, bidirectional=True, batch_first=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'tokenized_inputs' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0membedded\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0membedding\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtokenized_inputs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# tensor of size b x 300\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'tokenized_inputs' is not defined" + ] + } + ], + "source": [ + "embedded = embedding(tokenized_inputs) # tensor of size b x 300" + ] } ], "metadata": { diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index a46861eb..c81ed767 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -32,12 +32,13 @@ from dagobert.naming import NStudy, NPreprocessingArgs as npa from dagobert.modelling.dl import ( - GeneratorCryptoDataset, TemporalConvNet, Preprocessing, AdaBelief, LogCoshLoss, FocalLoss, + CryptoDataset, + GeneratorCryptoDataset, ) from dagobert.modelling.augmentation.utils import get_noise, pca_analysis from dagobert.modelling.utils import ( @@ -182,15 +183,15 @@ def __init__(self, hparams: Namespace): # define main vars (other than model) super().__init__() - # TODO: pre sanity check, define hparams hparams = 
TimeGANLightning._pre_sanity_check(hparams) # lightning sets this to cuda too late for some of our setup to work self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" # prepare datafiles if necessary - hparams = Preprocessing().preprocess_train_dfs(hparams) + self.hparams = Preprocessing().preprocess_train_dfs(hparams) # TODO: any sanity checks on data, hypermparams self.real_logging = None + self.comet_logging = not self.hparams.no_comet_logger # get feature number of instruments @@ -407,7 +408,7 @@ def configure_optimizers(self) -> List[optim.Optimizer]: list(self.discriminator.parameters()), ] # TODO: diff lr for each net - if "adam" in self.hparams.optimizer.lower(): + if "adamw" in self.hparams.optimizer.lower(): for param_pair, network in zip(param_pairs, self.hparams.lr.keys()): optimizer = torch.optim.AdamW(param_pair, lr=self.hparams.lr[network]) optimizers.append(optimizer) @@ -434,8 +435,11 @@ def validation_step(self, batch, batch_idx): self.hparams.z_dim, device=self.tgan_device, ) - z = z.to(self.generator.model[0].weight.dtype) + # z = z.to(self.generator.model[0].weight.dtype) + from IPython import embed + + embed() # generate fake data and compare with validation set h = self.embedder(x) e_hat = self.generator(z) @@ -491,6 +495,7 @@ def _get_dataloader(self, dfs_to_load: dict, prefix: str) -> DataLoader: shuffle = True else: shuffle = False + dataset = GeneratorCryptoDataset( df_to_load=dfs_to_load, cols_to_model=self.hparams.cols_to_model, @@ -713,10 +718,12 @@ def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, prefix): # ---------------------------------------------------------------------------------- @staticmethod - def _pre_sanity_check(hparams: Namespace): - # ensure we have the rl specific target column in the config + def _pre_sanity_check(hparams: Namespace) -> Namespace: + """Certain sanity checks must happen before preprocessing takes place.""" + + # ensure we have the no specific target column in the config if 
hparams.target_col: - raise ValueError("target_col has to be None for GAn development.") + raise ValueError("target_col has to be None for GAN development.") # fill in the same cols for any df that doesn't have the cols_to_model defined if len(hparams.cols_to_model) > 1: diff --git a/src/dagobert/modelling/dl/data.py b/src/dagobert/modelling/dl/data.py index 5de98d3e..b3f043f6 100644 --- a/src/dagobert/modelling/dl/data.py +++ b/src/dagobert/modelling/dl/data.py @@ -651,7 +651,5 @@ def __getitem__(self, idx): batch_dfs, batch_indices, _ = self._get_batch_dfs_indices_target() from_idx, upto_idx = self._get_from_upto_idxs(idx, batch_indices) Xs = self._get_Xs(batch_dfs, from_idx, upto_idx) - # from IPython import embed - # embed() X = np.concatenate(Xs).T return X From 99cbea4da30b5c612f7e69681bff86c20a5052a2 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Tue, 9 Feb 2021 19:09:12 +0000 Subject: [PATCH 47/62] vaaaaa --- config/custom/tcn_config_m.yaml | 2 +- notebooks/modelling/test_cryptodataset.ipynb | 224 ------------------ .../modelling/augmentation/__init__.py | 1 - .../modelling/augmentation/tgan_args.py | 1 + .../modelling/augmentation/tgan_runner.py | 3 +- .../modelling/augmentation/timegan.py | 17 +- src/dagobert/modelling/dl/__init__.py | 8 +- src/dagobert/modelling/dl/tcn_args.py | 1 + 8 files changed, 12 insertions(+), 245 deletions(-) diff --git a/config/custom/tcn_config_m.yaml b/config/custom/tcn_config_m.yaml index de51bdb2..02858439 100644 --- a/config/custom/tcn_config_m.yaml +++ b/config/custom/tcn_config_m.yaml @@ -15,7 +15,7 @@ auto_scale_batch_size: # -------------------------------------------------------------------------------------- log_dir: logs -num_workers: 1 +num_workers: 4 exp_name: TCN tags: - model1 diff --git a/notebooks/modelling/test_cryptodataset.ipynb b/notebooks/modelling/test_cryptodataset.ipynb index a4d4d903..8e3399e3 100644 --- a/notebooks/modelling/test_cryptodataset.ipynb +++ b/notebooks/modelling/test_cryptodataset.ipynb 
@@ -1715,230 +1715,6 @@ "source": [ "data_loaded[0].shape " ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Misc" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import torchtext\n", - "import torch\n", - "from torchtext.data.utils import get_tokenizer\n", - "from collections import Counter\n", - "from torchtext.vocab import Vocab\n", - "from torchtext.utils import download_from_url, extract_archive\n", - "import io" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: en_core_web_sm==2.3.1 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm==2.3.1 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (2.3.1)\n", - "Requirement already satisfied: spacy<2.4.0,>=2.3.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from en_core_web_sm==2.3.1) (2.3.2)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (3.0.2)\n", - "Requirement already satisfied: setuptools in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (45.2.0.post20200210)\n", - "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.18.1)\n", - "Requirement already satisfied: blis<0.5.0,>=0.4.0 in 
c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (0.4.1)\n", - "Requirement already satisfied: thinc==7.4.1 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (7.4.1)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (0.8.0)\n", - "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.1.3)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.23.0)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.2)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.0.3)\n", - "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.2)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (4.45.0)\n", - "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in 
c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2019.11.28)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.9)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.25.8)\n", - "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.7.0)\n", - "Requirement already satisfied: zipp>=0.5 in c:\\users\\u164428\\appdata\\local\\continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.2.0)\n", - "[+] Download and installation successful\n", - "You can now load the model via spacy.load('en_core_web_sm')\n", - "[x] Couldn't link model to 'en'\n", - "Creating a symlink in spacy/data failed. Make sure you have the required\n", - "permissions and try re-running the command as admin, or use a virtualenv. 
You\n", - "can still import the model as a module and call its load() method, or create the\n", - "symlink manually.\n", - "C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages\\en_core_web_sm\n", - "-->\n", - "C:\\Users\\u164428\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages\\spacy\\data\\en\n", - "[!] Download successful but linking failed\n", - "Creating a shortcut link for 'en' didn't work (maybe you don't have admin\n", - "permissions?), but you can still load the model via its full package name: nlp =\n", - "spacy.load('en_core_web_sm')\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You do not have sufficient privilege to perform this operation.\n" - ] - } - ], - "source": [ - "! python -m spacy download en" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "inputs = [\n", - " 'hello, my name is david', \n", - " 'david likes to swim', \n", - " 'the only thing david needs is attention'\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "Argument 'string' has incorrect type (expected str, got list)", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# We tokenize our input variables into numbers based on a loaded vocab\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mtokenized_inputs\u001b[0m 
\u001b[1;33m=\u001b[0m \u001b[0men_tokenizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# b x max_input_length\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\dagobert\\lib\\site-packages\\torchtext\\data\\utils.py\u001b[0m in \u001b[0;36m_spacy_tokenize\u001b[1;34m(x, spacy)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_spacy_tokenize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mspacy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mtok\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtok\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mspacy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mTypeError\u001b[0m: Argument 'string' has incorrect type (expected str, got list)" - ] - } - ], - "source": [ - "# We tokenize our input variables into numbers based on a loaded vocab\n", - "tokenized_inputs = en_tokenizer(inputs) # b x max_input_length" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "def build_vocab(input_list, tokenizer):\n", - " counter = Counter()\n", - " for string_ in input_list:\n", - " counter.update(tokenizer(string_))\n", - " return Vocab(counter, specials=['', '', '', ''])" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - 
"en_vocab = build_vocab(inputs, en_tokenizer)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(, 18)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "en_vocab, len(en_vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "# We define our layers\n", - "hidden_size = 300\n", - "embedding = torch.nn.Embedding(len(en_vocab), hidden_size)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "# RNN encoded size must be half since bidirectional RNNs\n", - "# produce 2 hidden states: forwards and backwards\n", - "encoded_size = int(hidden_size / 2)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "rnn = torch.nn.GRU(hidden_size, encoded_size, bidirectional=True, batch_first=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'tokenized_inputs' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0membedded\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0membedding\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtokenized_inputs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# tensor of size b x 300\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mNameError\u001b[0m: name 'tokenized_inputs' is not defined" - ] - } - ], - "source": [ - "embedded = embedding(tokenized_inputs) # tensor of size b x 300" - ] } ], "metadata": { diff --git 
a/src/dagobert/modelling/augmentation/__init__.py b/src/dagobert/modelling/augmentation/__init__.py index dbab2837..a4400910 100644 --- a/src/dagobert/modelling/augmentation/__init__.py +++ b/src/dagobert/modelling/augmentation/__init__.py @@ -1,2 +1 @@ from .augmentation import augment -from .timegan import RnnBlock, TimeGANLightning diff --git a/src/dagobert/modelling/augmentation/tgan_args.py b/src/dagobert/modelling/augmentation/tgan_args.py index ef0c4a6e..0491bbcb 100644 --- a/src/dagobert/modelling/augmentation/tgan_args.py +++ b/src/dagobert/modelling/augmentation/tgan_args.py @@ -6,6 +6,7 @@ from pytorch_lightning import Trainer +from dagobert.modelling.dl.tcn import TCNLightning from dagobert.modelling.dl.tcn_args import ( add_run_specific_args, add_model_specific_args, diff --git a/src/dagobert/modelling/augmentation/tgan_runner.py b/src/dagobert/modelling/augmentation/tgan_runner.py index 3ae6c027..90d4e8b6 100644 --- a/src/dagobert/modelling/augmentation/tgan_runner.py +++ b/src/dagobert/modelling/augmentation/tgan_runner.py @@ -5,6 +5,8 @@ command line arguments, but it's much more convenient to use YAML configs for this, see the `tcn_args.py` and `tgan_args.py` for more detail. 
""" +import os +import sys import logging from pathlib import Path @@ -13,7 +15,6 @@ from dagobert.modelling.augmentation.tgan_args import get_all_args from dagobert.modelling.augmentation.timegan import run_tgan - logger = logging.getLogger(__name__) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index c81ed767..8877b1af 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -31,15 +31,11 @@ from pytorch_lightning.metrics import functional as plm from dagobert.naming import NStudy, NPreprocessingArgs as npa -from dagobert.modelling.dl import ( - TemporalConvNet, - Preprocessing, - AdaBelief, - LogCoshLoss, - FocalLoss, - CryptoDataset, - GeneratorCryptoDataset, -) + +from dagobert.modelling.dl import Preprocessing +from dagobert.modelling.dl.data import GeneratorCryptoDataset +from dagobert.modelling.dl import AdaBelief + from dagobert.modelling.augmentation.utils import get_noise, pca_analysis from dagobert.modelling.utils import ( triple_barrier_error, @@ -145,6 +141,7 @@ def __init__( self.linear_activation = linear_activation def forward(self, x): + rnn_out, _hidden = self.rnn(x) rnn_out = self.tanh(rnn_out) # todo: is there reshaping needed? 
@@ -437,9 +434,7 @@ def validation_step(self, batch, batch_idx): ) # z = z.to(self.generator.model[0].weight.dtype) - from IPython import embed - embed() # generate fake data and compare with validation set h = self.embedder(x) e_hat = self.generator(z) diff --git a/src/dagobert/modelling/dl/__init__.py b/src/dagobert/modelling/dl/__init__.py index a2849ec8..52b1e0bb 100644 --- a/src/dagobert/modelling/dl/__init__.py +++ b/src/dagobert/modelling/dl/__init__.py @@ -1,10 +1,4 @@ -from .data import ( - CryptoDataset, - PortfolioCryptoDataset, - ExperienceSourceDataset, - GeneratorCryptoDataset, -) - +from .data import PortfolioCryptoDataset, ExperienceSourceDataset, CryptoDataset from .tcn_net import TemporalConvNet from .utils import LogCoshLoss, FocalLoss, MixedNormalPDFLoss from .adabelief import AdaBelief diff --git a/src/dagobert/modelling/dl/tcn_args.py b/src/dagobert/modelling/dl/tcn_args.py index acaa4198..48370ef3 100644 --- a/src/dagobert/modelling/dl/tcn_args.py +++ b/src/dagobert/modelling/dl/tcn_args.py @@ -5,6 +5,7 @@ from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter + from pytorch_lightning import Trainer from dagobert.modelling.dl.tcn import TCNLightning From d646bf61242ea9a097f9a5235a5fa452b584668b Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Thu, 18 Feb 2021 11:44:46 +0000 Subject: [PATCH 48/62] bugz --- config/custom/tcn_config_m.yaml | 41 ++++++++++--------- config/timegan_config.yaml | 9 ++-- .../modelling/augmentation/timegan.py | 32 +++++++++------ src/dagobert/modelling/dl/tcn.py | 4 +- src/dagobert/modelling/utils.py | 6 +-- 5 files changed, 51 insertions(+), 41 deletions(-) diff --git a/config/custom/tcn_config_m.yaml b/config/custom/tcn_config_m.yaml index 02858439..64e60696 100644 --- a/config/custom/tcn_config_m.yaml +++ b/config/custom/tcn_config_m.yaml @@ -22,11 +22,11 @@ tags: - ethusdt_volume500 - simple_lookahead_y no_comet_logger: True -seed: 40 +seed: 42 batch_size: 256 early_stopping_rounds: 15 optimizer: 
adabelief -lr: 0.001 +lr: 'auto' max_lr: 0.1 max_lr_multiplier: 10 one_cycle_length: 60 @@ -40,7 +40,7 @@ output_size: 1 num_channels: [20, 20, 20, 20] kernel_size: 3 dropout: 0.5 -use_last_timepoint: False +use_last_timepoint: True last_y: True non_last_y_frac: 0.5 regression: False @@ -58,14 +58,14 @@ no_sample_weights: True data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" lookback: auto -mini_series_length: 20 +mini_series_length: auto # If this is set to a number, then simple lookahead labelling is in place simple_lookahead_y: 15 simple_lookahead_reg: False # If this is True, anchor is labelled before preprocessing. to_label and simple_lookahead_y cannot be used together. -to_label: False +to_label: True label_sl: 1 label_pt: 1 label_first_or_max: "first" @@ -84,6 +84,12 @@ cols_to_model: - high - low - close + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell # - open_fd_0.0 # - high_fd_0.0 # - low_fd_0.0 @@ -94,12 +100,6 @@ cols_to_model: # - close_fd_tuned # - cum_ticks # - cum_dollar -# - volume -# - cum_volume_buy -# - cum_volume_sell -# - cum_volume_quote -# - cum_volume_quote_buy -# - cum_volume_quote_sell # - sin_date # - cos_date # - sin_time @@ -131,22 +131,23 @@ augment_method: random_fast augment_prob: 0.25 simple_augment_dfs: - std_bar_BTCUSDT_tick_1.feather - - std_bar_LTCUSDT_tick_1.feather - - std_bar_XRPUSDT_tick_1.feather - - std_bar_BTCUSDT_volume_100.feather - - std_bar_LTCUSDT_volume_1000.feather - - std_bar_XRPUSDT_volume_125000.feather - - std_bar_BTCUSDT_dollar_1000000.feather - - std_bar_LTCUSDT_dollar_40000.feather - - std_bar_XRPUSDT_dollar_20000.feather +# - std_bar_LTCUSDT_tick_1.feather +# - std_bar_XRPUSDT_tick_1.feather +# - std_bar_BTCUSDT_volume_100.feather +# - std_bar_LTCUSDT_volume_1000.feather +# - std_bar_XRPUSDT_volume_125000.feather +# - std_bar_BTCUSDT_dollar_1000000.feather +# - std_bar_LTCUSDT_dollar_40000.feather +# - 
std_bar_XRPUSDT_dollar_20000.feather simple_augment_prob: 0.5 +augment_dfs_mix: 0.33 # -------------------------------------------------------------------------------------- # PREPROCESSING # -------------------------------------------------------------------------------------- train_start_date: "2018-06-01" -train_days: 1 +train_days: 2 val_days: 1 val_train_offset_days: 1 val_puffer_days: 1 diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index 33cb093a..526c4625 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -4,7 +4,9 @@ # -------------------------------------------------------------------------------------- gpus: 0 - +pin_memory: True +val_check_interval: 1 +print_nan_grads: True # -------------------------------------------------------------------------------------- # RUN @@ -17,15 +19,14 @@ tags: - time_gan_test no_comet_logger: True seed: 42 -batch_size: 256 - +batch_size: 64 # -------------------------------------------------------------------------------------- # GAN # -------------------------------------------------------------------------------------- # gru or lstm -rnn: gru +rnn: "gru" # embedding weight in cost of generator loss emb_weight: 1 diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 8877b1af..5ca35dc6 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -30,7 +30,7 @@ from pytorch_lightning import Trainer, Callback, loggers from pytorch_lightning.metrics import functional as plm -from dagobert.naming import NStudy, NPreprocessingArgs as npa +from dagobert.naming import NStudy, NGAN, NPreprocessingArgs as npa from dagobert.modelling.dl import Preprocessing from dagobert.modelling.dl.data import GeneratorCryptoDataset @@ -118,7 +118,7 @@ def __init__( super(RnnBlock, self).__init__() # input/output: (batch, seq, feature) - if rnn == "lstm": + if rnn == NGAN.lstm: 
self.rnn = nn.LSTM( input_size=input_size, hidden_size=hidden_size, @@ -126,7 +126,7 @@ def __init__( dropout=dropout, batch_first=batch_first, ) - elif rnn == "gru": + elif rnn == NGAN.gru: self.rnn = nn.GRU( input_size=input_size, hidden_size=hidden_size, @@ -141,11 +141,10 @@ def __init__( self.linear_activation = linear_activation def forward(self, x): - rnn_out, _hidden = self.rnn(x) rnn_out = self.tanh(rnn_out) # todo: is there reshaping needed? - rnn_out = rnn_out.reshape(-1, self.linear_input_size) + # rnn_out = rnn_out.reshape(-1, self.linear_input_size) output = self.linear(rnn_out) if self.linear_activation: output = self.sigmoid(output) @@ -241,6 +240,7 @@ def __init__(self, hparams: Namespace): rnn=self.hparams.rnn, linear_activation=True, ) + # TODO: shape of disc (batch, time, 1) or more rather (batch, 1) self.discriminator = RnnBlock( input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, @@ -269,7 +269,8 @@ def training_step(self, batch, batch_idx, optimizer_idx): Returns: Loss """ - x = batch + print(f"train_step optimizer_idx: {optimizer_idx}") + x = batch.float() batch_len = len(x) h = self.embedder(x) @@ -288,6 +289,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): prog_bar=True, logger=True, ) + return loss_e elif optimizer_idx == 3: @@ -341,6 +343,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): # synthetic data x_hat = self.recovery(h_hat) + # no_grad to leave discriminator unchanged with torch.no_grad(): y_fake = self.discriminator(h_hat) @@ -364,10 +367,11 @@ def training_step(self, batch, batch_idx, optimizer_idx): ) return loss_gen + # TODO: if (check_d_loss > 0.15): # update discriminator elif optimizer_idx == 4: e_hat = self.generator(z) - h_hat = self.supervisor(e_hat) + h_hat = self.supervisor(e_hat.detach()) # detach to update only discriminator y_fake = self.discriminator(h_hat.detach()) y_fake_e = self.discriminator(e_hat.detach()) @@ -386,6 +390,7 @@ def training_step(self, batch, 
batch_idx, optimizer_idx): ) # pytorch lightning needs to have "loss" in the return dict return { + "loss": loss_disc, "loss_disc/train": loss_disc, "y_fake/train": y_fake, "y_fake_e/train": y_fake_e, @@ -422,7 +427,9 @@ def training_epoch_end(self, outputs): return self._epoch_end(outputs, "train") def validation_step(self, batch, batch_idx): - x = batch + # change float64 to float32 + print(f"val_step batch idx: {batch_idx} LOFASZJOSKA") + x = batch.float() batch_len = len(x) # noise @@ -433,10 +440,9 @@ def validation_step(self, batch, batch_idx): device=self.tgan_device, ) - # z = z.to(self.generator.model[0].weight.dtype) - # generate fake data and compare with validation set h = self.embedder(x) + e_hat = self.generator(z) h_hat = self.supervisor(e_hat) x_hat = self.recovery(h_hat) @@ -648,8 +654,8 @@ def _epoch_end(self, outputs, prefix="val"): y_fake_e.append(x[f"y_fake_e/{prefix}"]) pca_x.append(x[f"pca_x/{prefix}"]) pca_x_hat.append(x[f"pca_x_hat/{prefix}"]) - # log sampled images - self._make_plots(y_real, y_fake, pca_x, pca_x_hat, prefix) + # log sampled images, only first batch (2 validation rounds @ start) + self._make_plots(y_real[0], y_fake[0], pca_x[0], pca_x_hat[0], prefix) # ---------------------------------------------------------------------------------- # PLOTTING AND LOGGING FUNCTIONS @@ -727,4 +733,6 @@ def _pre_sanity_check(hparams: Namespace) -> Namespace: hparams.cols_to_model[df_name] = deepcopy( hparams.cols_to_model[npa.anchor] ) + if hparams.rnn not in [NGAN.gru, NGAN.lstm]: + raise ValueError("rnn has to be either 'gru' or 'lstm'.") return hparams diff --git a/src/dagobert/modelling/dl/tcn.py b/src/dagobert/modelling/dl/tcn.py index 79dd6457..e17790a6 100644 --- a/src/dagobert/modelling/dl/tcn.py +++ b/src/dagobert/modelling/dl/tcn.py @@ -737,8 +737,8 @@ def _check_mini_series_lookback(hparams: Namespace) -> Namespace: num_channels = f"{case}num_channels" k_size = f"{case}kernel_size" if num_channels in hparams: - net_depth = 
len(hparams.__getattribute__(num_channels)) - k_size = hparams.__getattribute__(k_size) + net_depth = len(hparams[num_channels]) + k_size = hparams[k_size] max_seq_len = TemporalConvNet.get_tcn_receptive_field_size( k_size, net_depth ) diff --git a/src/dagobert/modelling/utils.py b/src/dagobert/modelling/utils.py index 90bf465c..69a3d876 100644 --- a/src/dagobert/modelling/utils.py +++ b/src/dagobert/modelling/utils.py @@ -423,7 +423,7 @@ def plot_anchor_sample(i, obj, x): df.plot(subplots=True, layout=(int(np.ceil((len(cols) / 4))), 4)) -def plot_pca(pca_x, pca_x_hat): +def plot_pca(pca_x, pca_x_hat) -> Figure: """ Plot PCA-reduced x and x_hat to visualise similarity. Overlap suggests similarity. Args: @@ -431,10 +431,10 @@ def plot_pca(pca_x, pca_x_hat): pca_x_hat: 2-component-PCA of x_hat Returns: - Scatter plot showing 2-component-PCA of x & x_hat. + Scatter plot showing 2-component-PCA of x & x_hat. """ - f, ax = plt.subplots(1) + length = pca_x.shape[0] colors = ["red" for i in range(length)] + ["blue" for i in range(length)] plt.scatter(pca_x[:, 0], pca_x[:, 1], c=colors[:length], alpha=0.2, label="Real") From 26f41b06b41a2d3c0398309cd46f036991029a0f Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Mon, 22 Feb 2021 17:36:15 +0000 Subject: [PATCH 49/62] just do it --- .../modelling/augmentation/timegan.py | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 5ca35dc6..2059e549 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -79,11 +79,11 @@ def run_tgan(args): # setup callbacks checkpoint_callback = ModelCheckpoint( - monitor="loss_gen", + monitor="loss_disc/val", filename="_{epoch:02d}_{avg_reward:.10f}", dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", save_top_k=3, - mode="min", + mode="max", ) # define trainer and and lightning module 
@@ -134,7 +134,7 @@ def __init__( dropout=dropout, batch_first=batch_first, ) - self.linear_input_size = linear_input_size + self.linear_output_size = linear_output_size self.tanh = nn.Tanh() self.linear = nn.Linear(linear_input_size, linear_output_size) self.sigmoid = nn.Sigmoid() @@ -143,11 +143,11 @@ def __init__( def forward(self, x): rnn_out, _hidden = self.rnn(x) rnn_out = self.tanh(rnn_out) - # todo: is there reshaping needed? - # rnn_out = rnn_out.reshape(-1, self.linear_input_size) + # reshape if net is the discirminator + if self.linear_output_size == 1: + rnn_out = rnn_out.reshape(rnn_out.shape[0], -1) output = self.linear(rnn_out) - if self.linear_activation: - output = self.sigmoid(output) + output = self.sigmoid(output) return output @@ -245,7 +245,8 @@ def __init__(self, hparams: Namespace): input_size=self.hparams.hidden_size, hidden_size=self.hparams.hidden_size, num_layers=self.hparams.num_layers, - linear_input_size=self.hparams.hidden_size, + linear_input_size=self.hparams.hidden_size + * self.hparams.mini_series_length, linear_output_size=1, dropout=self.hparams.dropout, batch_first=True, @@ -282,7 +283,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): if optimizer_idx == 0: loss_e = TimeGANLightning.embed_loss0(x_tilde, x) self.log( - "loss_e", + "loss_e/train", loss_e, on_step=False, on_epoch=True, @@ -301,7 +302,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): h, ) self.log( - "loss_embed", + "loss_embed/train", loss_embed, on_step=False, on_epoch=True, @@ -315,7 +316,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): h_hat_supervise = self.supervisor(h) loss_supervisor = TimeGANLightning.supervisor_loss(h_hat_supervise, h) self.log( - "loss_supervisor", + "loss_supervisor/train", loss_supervisor, on_step=False, on_epoch=True, @@ -358,7 +359,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): self.hparams.emb_weight, ) self.log( - "loss_gen", + "loss_gen/train", loss_gen, on_step=False, 
on_epoch=True, @@ -423,12 +424,8 @@ def configure_optimizers(self) -> List[optim.Optimizer]: def train_dataloader(self): return self._get_dataloader(self.hparams.df_train, "train") - def training_epoch_end(self, outputs): - return self._epoch_end(outputs, "train") - def validation_step(self, batch, batch_idx): # change float64 to float32 - print(f"val_step batch idx: {batch_idx} LOFASZJOSKA") x = batch.float() batch_len = len(x) @@ -442,7 +439,6 @@ def validation_step(self, batch, batch_idx): # generate fake data and compare with validation set h = self.embedder(x) - e_hat = self.generator(z) h_hat = self.supervisor(e_hat) x_hat = self.recovery(h_hat) @@ -476,6 +472,9 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): return self._epoch_end(outputs, "val") + def training_epoch_end(self, outputs): + return self._epoch_end(outputs, "train") + def val_dataloader(self): return self._get_dataloader(self.hparams.df_val, "val") @@ -536,7 +535,8 @@ def embed_loss0(x_tilde, x): def supervisor_loss(h_hat_supervise, h): """ This loss further ensures that generator produces similar stepwise transitions - (evaluated by ground-truth targets). + (evaluated by ground-truth targets). Responsible to capture how well the + generator approximates the next time step in the latent space. 
Args: h_hat_supervise: supervisors output from feeding h (real embedding) through h: real embedding defined by embedder net @@ -641,21 +641,24 @@ def _epoch_end(self, outputs, prefix="val"): Returns: """ - avg_loss = [] - y_real = [] - y_fake = [] - y_fake_e = [] - pca_x = [] - pca_x_hat = [] - for x in outputs: - avg_loss.append(x[f"loss_disc/{prefix}"]) - y_real.append(x[f"y_real/{prefix}"]) - y_fake.append(x[f"y_fake/{prefix}"]) - y_fake_e.append(x[f"y_fake_e/{prefix}"]) - pca_x.append(x[f"pca_x/{prefix}"]) - pca_x_hat.append(x[f"pca_x_hat/{prefix}"]) - # log sampled images, only first batch (2 validation rounds @ start) - self._make_plots(y_real[0], y_fake[0], pca_x[0], pca_x_hat[0], prefix) + if prefix == "train": + pass + elif prefix == "val": + avg_loss = [] + y_real = [] + y_fake = [] + y_fake_e = [] + pca_x = [] + pca_x_hat = [] + for x in outputs: + avg_loss.append(x[f"loss_disc/{prefix}"]) + y_real.append(x[f"y_real/{prefix}"]) + y_fake.append(x[f"y_fake/{prefix}"]) + y_fake_e.append(x[f"y_fake_e/{prefix}"]) + pca_x.append(x[f"pca_x/{prefix}"]) + pca_x_hat.append(x[f"pca_x_hat/{prefix}"]) + # log sampled images, only first batch (2 validation rounds @ start) + self._make_plots(y_real[0], y_fake[0], pca_x[0], pca_x_hat[0], prefix) # ---------------------------------------------------------------------------------- # PLOTTING AND LOGGING FUNCTIONS From de5339976ac836bb7829a1932154a0788675d48f Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Tue, 23 Feb 2021 14:35:03 +0000 Subject: [PATCH 50/62] corrections --- config/timegan_config.yaml | 6 ++-- .../modelling/augmentation/timegan.py | 29 ++++++++++++------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index 526c4625..78befdae 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -5,7 +5,7 @@ gpus: 0 pin_memory: True -val_check_interval: 1 +val_check_interval: 0.5 print_nan_grads: True # 
-------------------------------------------------------------------------------------- @@ -44,7 +44,7 @@ optimizer: "adamw" dropout: 0.2 num_layers: 1 hidden_size: 10 -z_dim: 12 +z_dim: 32 mini_series_length: 20 # don't change order with lr dict lr: @@ -98,7 +98,7 @@ augment_dfs_mix: 0 # -------------------------------------------------------------------------------------- train_start_date: "2019-01-01" -train_days: 1 +train_days: 10 val_days: 1 val_train_offset_days: 1 val_puffer_days: 1 diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 2059e549..0f6992ef 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -143,11 +143,13 @@ def __init__( def forward(self, x): rnn_out, _hidden = self.rnn(x) rnn_out = self.tanh(rnn_out) - # reshape if net is the discirminator + # reshape if net is the discriminator, and no activation if self.linear_output_size == 1: rnn_out = rnn_out.reshape(rnn_out.shape[0], -1) - output = self.linear(rnn_out) - output = self.sigmoid(output) + output = self.linear(rnn_out) + else: + output = self.linear(rnn_out) + output = self.sigmoid(output) return output @@ -270,7 +272,6 @@ def training_step(self, batch, batch_idx, optimizer_idx): Returns: Loss """ - print(f"train_step optimizer_idx: {optimizer_idx}") x = batch.float() batch_len = len(x) @@ -573,8 +574,9 @@ def generator_loss( Loss """ # adversarial - g_loss_u = nn.BCELoss()(y_fake, torch.ones_like(y_fake)) - g_loss_u_e = nn.BCELoss()(y_fake_e, torch.ones_like(y_fake_e)) + criterion = nn.BCEWithLogitsLoss() + g_loss_u = criterion(y_fake, torch.ones_like(y_fake)) + g_loss_u_e = criterion(y_fake_e, torch.ones_like(y_fake_e)) w_g_loss_u_e = emb_weight * g_loss_u_e # supervisor g_loss_s = TimeGANLightning.supervisor_loss(h_hat_supervise, h) @@ -608,7 +610,8 @@ def embedder_loss(x_tilde, x, h_hat_supervise, h): @staticmethod def discriminator_loss(y_fake, y_fake_e, y_real, 
emb_weight): """ - Discriminator’s binary adversarial feedback, both on fake and real data. + Discriminator’s binary adversarial feedback, both on fake and real data. Real + data is labelled as 1, fake as 0. Args: y_fake: logits for classification of fakes (from h_hat) y_fake_e: logits for classification of fake embeddings (from e_hat) @@ -618,13 +621,11 @@ def discriminator_loss(y_fake, y_fake_e, y_real, emb_weight): Returns: Loss """ - # TODO: is this the correct loss? discriminator returns logits w/out activation. - # changed compared to original TF implementation criterion = nn.BCEWithLogitsLoss() d_loss_fake_e = criterion(y_fake_e, torch.zeros_like(y_fake_e)) d_loss_fake = criterion(y_fake, torch.zeros_like(y_fake)) d_loss_real = criterion(y_real, torch.ones_like(y_real)) - # TODO: any use of dividing loss by (2 + emb_weight)? + # TODO: any use of dividing loss by (2 + emb_weight)? probably not return emb_weight * d_loss_fake_e + d_loss_fake + d_loss_real # ---------------------------------------------------------------------------------- @@ -711,9 +712,15 @@ def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, prefix): ) # HISTOGRAM + # discirminator's take on real data + y_real_class = (torch.sigmoid(y_real) > 0.5).int() + y_real_for_hist = torch.sigmoid(y_real) + # discirminator's take on fake data + y_fake_class = (torch.sigmoid(y_fake) > 0.5).int() + y_fake_for_hist = torch.sigmoid(y_fake) self._log_image( f"real v fake hist/{prefix}", - hist_from_tensor(y_real, y_fake), + hist_from_tensor(y_real, y_real_for_hist), self.current_epoch, ) From 4f055d3cb0aac09f311a7cd84d3be385b06b0acf Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Wed, 24 Feb 2021 20:02:31 +0000 Subject: [PATCH 51/62] TGAN --- .../modelling/augmentation/timegan.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 0f6992ef..7c691182 100644 --- 
a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -100,7 +100,7 @@ def run_tgan(args): class RnnBlock(nn.Module): """ - Class for creating 5 components of TimeGAN. + Class for creating 5 different rnn-based nets as components of TimeGAN. """ def __init__( @@ -265,8 +265,8 @@ def training_step(self, batch, batch_idx, optimizer_idx): """ Carries out updates to networks from a batch of real samples. Args: - batch: batch of - batch_idx: + batch: batch of X. + batch_idx: idx of batch optimizer_idx: idx that controls optimizing the 5 networks Returns: @@ -390,14 +390,16 @@ def training_step(self, batch, batch_idx, optimizer_idx): prog_bar=True, logger=True, ) - # pytorch lightning needs to have "loss" in the return dict - return { - "loss": loss_disc, - "loss_disc/train": loss_disc, - "y_fake/train": y_fake, - "y_fake_e/train": y_fake_e, - "y_real/train": y_real, - } + # limit discriminator from being "too good" + if loss_disc > 0.15: + # pytorch lightning needs to have "loss" in the return dict + return { + "loss": loss_disc, + "loss_disc/train": loss_disc, + "y_fake/train": y_fake, + "y_fake_e/train": y_fake_e, + "y_real/train": y_real, + } def configure_optimizers(self) -> List[optim.Optimizer]: """ @@ -625,7 +627,7 @@ def discriminator_loss(y_fake, y_fake_e, y_real, emb_weight): d_loss_fake_e = criterion(y_fake_e, torch.zeros_like(y_fake_e)) d_loss_fake = criterion(y_fake, torch.zeros_like(y_fake)) d_loss_real = criterion(y_real, torch.ones_like(y_real)) - # TODO: any use of dividing loss by (2 + emb_weight)? probably not + # TODO: any use of dividing loss by (2 + emb_weight)? probably readability return emb_weight * d_loss_fake_e + d_loss_fake + d_loss_real # ---------------------------------------------------------------------------------- @@ -637,7 +639,7 @@ def _epoch_end(self, outputs, prefix="val"): and log them. Finally, we make plots using all the y_true and y_preds. 
Args: outputs: - prefix: + prefix: indicates train or val epoch end Returns: From 9c4913b62d0de07b27fbe9ef79171a7a2f17d931 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Sun, 28 Feb 2021 21:33:40 +0000 Subject: [PATCH 52/62] fixed optimizers as per TGAN paper. not sure if/why images log or not or what the HEKK --- .../modelling/augmentation/timegan.py | 97 +++++++++---------- src/dagobert/modelling/utils.py | 8 +- 2 files changed, 49 insertions(+), 56 deletions(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 7c691182..5365b70c 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -186,9 +186,7 @@ def __init__(self, hparams: Namespace): self.tgan_device = "cuda" if hparams.gpus > 0 else "cpu" # prepare datafiles if necessary self.hparams = Preprocessing().preprocess_train_dfs(hparams) - # TODO: any sanity checks on data, hypermparams - self.real_logging = None self.comet_logging = not self.hparams.no_comet_logger @@ -277,8 +275,8 @@ def training_step(self, batch, batch_idx, optimizer_idx): h = self.embedder(x) - # optimizers #0 & #3 update embedder nets - if optimizer_idx in [0, 3]: + # optimizers #0, #3 & #5 update embedder nets + if optimizer_idx in [0, 3, 5]: x_tilde = self.recovery(h) # optimize embedding via embedder and recovery nets if optimizer_idx == 0: @@ -294,7 +292,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): return loss_e - elif optimizer_idx == 3: + elif optimizer_idx in [3, 5]: h_hat_supervise = self.supervisor(h) loss_embed = TimeGANLightning.embedder_loss( x_tilde, @@ -326,9 +324,8 @@ def training_step(self, batch, batch_idx, optimizer_idx): ) return loss_supervisor - # TODO: If you need to control how often those optimizers step or override - # the default .step() schedule, override the optimizer_step() hook. 
- elif optimizer_idx in [2, 4]: + # TODO: can we streamline 7 optimizers to 5 with optimizer_step() hook? + elif optimizer_idx in [2, 4, 6]: # random input to generator z = get_noise( batch_len, @@ -337,41 +334,39 @@ def training_step(self, batch, batch_idx, optimizer_idx): device=self.tgan_device, ) # update generator - if optimizer_idx == 2: - for i in range(2): - e_hat = self.generator(z) - h_hat = self.supervisor(e_hat) - h_hat_supervise = self.supervisor(h) - - # synthetic data - x_hat = self.recovery(h_hat) - - # no_grad to leave discriminator unchanged - with torch.no_grad(): - y_fake = self.discriminator(h_hat) - y_fake_e = self.discriminator(e_hat) - loss_gen = TimeGANLightning.generator_loss( - y_fake, - y_fake_e, - h, - h_hat_supervise, - x, - x_hat, - self.hparams.emb_weight, - ) - self.log( - "loss_gen/train", - loss_gen, - on_step=False, - on_epoch=True, - prog_bar=True, - logger=True, - ) - return loss_gen + if optimizer_idx in [2, 4]: + e_hat = self.generator(z) + h_hat = self.supervisor(e_hat) + h_hat_supervise = self.supervisor(h) + + # synthetic data + x_hat = self.recovery(h_hat) + + # no_grad to leave discriminator unchanged + with torch.no_grad(): + y_fake = self.discriminator(h_hat) + y_fake_e = self.discriminator(e_hat) + loss_gen = TimeGANLightning.generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + self.hparams.emb_weight, + ) + self.log( + "loss_gen/train", + loss_gen, + on_step=False, + on_epoch=True, + prog_bar=True, + logger=True, + ) + return loss_gen - # TODO: if (check_d_loss > 0.15): # update discriminator - elif optimizer_idx == 4: + elif optimizer_idx == 6: e_hat = self.generator(z) h_hat = self.supervisor(e_hat.detach()) # detach to update only discriminator @@ -411,6 +406,8 @@ def configure_optimizers(self) -> List[optim.Optimizer]: list(self.generator.parameters()) + list(self.supervisor.parameters()), list(self.generator.parameters()) + list(self.supervisor.parameters()), 
list(self.embedder.parameters()) + list(self.recovery.parameters()), + list(self.generator.parameters()) + list(self.supervisor.parameters()), + list(self.embedder.parameters()) + list(self.recovery.parameters()), list(self.discriminator.parameters()), ] # TODO: diff lr for each net @@ -683,12 +680,11 @@ def _log_image(self, image_name, image_data, i): """ Logs any generated image to both tensorboard and comet. """ - if self.real_logging: - self.logger.experiment[0].add_image(image_name, fig_to_tb(image_data), i) - if self.comet_logging: - self.logger.experiment[1].log_image( - fig_to_comet(image_data), name=image_name, step=i - ) + self.logger.experiment[0].add_image(image_name, fig_to_tb(image_data), i) + if self.comet_logging: + self.logger.experiment[1].log_image( + fig_to_comet(image_data), name=image_name, step=i + ) def _log_graph(self, datasets: GeneratorCryptoDataset): """ @@ -697,8 +693,7 @@ def _log_graph(self, datasets: GeneratorCryptoDataset): examples_dataloader = DataLoader(datasets, batch_size=32) example_shapes = [xi.shape for xi in next(iter(examples_dataloader))[0]] examples = [torch.rand(*s).float().to(self.tgan_device) for s in example_shapes] - if self.real_logging: - self.logger.experiment[0].add_graph(self, examples) + self.logger.experiment[0].add_graph(self, examples) def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, prefix): """ @@ -714,10 +709,10 @@ def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, prefix): ) # HISTOGRAM - # discirminator's take on real data + # discriminator's take on real data y_real_class = (torch.sigmoid(y_real) > 0.5).int() y_real_for_hist = torch.sigmoid(y_real) - # discirminator's take on fake data + # discriminator's take on fake data y_fake_class = (torch.sigmoid(y_fake) > 0.5).int() y_fake_for_hist = torch.sigmoid(y_fake) self._log_image( diff --git a/src/dagobert/modelling/utils.py b/src/dagobert/modelling/utils.py index 69a3d876..0de5e3ff 100644 --- a/src/dagobert/modelling/utils.py +++ 
b/src/dagobert/modelling/utils.py @@ -435,19 +435,17 @@ def plot_pca(pca_x, pca_x_hat) -> Figure: """ f, ax = plt.subplots(1) - length = pca_x.shape[0] - colors = ["red" for i in range(length)] + ["blue" for i in range(length)] - plt.scatter(pca_x[:, 0], pca_x[:, 1], c=colors[:length], alpha=0.2, label="Real") + plt.scatter(pca_x[:, 0], pca_x[:, 1], c="black", alpha=0.2, label="Real") plt.scatter( pca_x_hat[:, 0], pca_x_hat[:, 1], - c=colors[length:], + c="red", alpha=0.2, label="Synthetic", ) ax.legend() plt.title("PCA plot") - plt.xlabel("x-pca") + plt.xlabel("x_pca") plt.ylabel("y_pca") plt.close() return f From efc0556abacc16373df3956f81eb9c0bdeb430a1 Mon Sep 17 00:00:00 2001 From: MateMarcell Date: Sun, 28 Feb 2021 21:42:42 +0000 Subject: [PATCH 53/62] fix fix --- config/timegan_config.yaml | 2 ++ src/dagobert/modelling/augmentation/timegan.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index 78befdae..59bf1bc3 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -52,6 +52,8 @@ lr: supervisor: 0.001 generator: 0.001 embedder1: 0.001 + generator_: 0.001 + embedder1_: 0.001 discriminator: 0.001 # -------------------------------------------------------------------------------------- diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 5365b70c..99263636 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -410,7 +410,6 @@ def configure_optimizers(self) -> List[optim.Optimizer]: list(self.embedder.parameters()) + list(self.recovery.parameters()), list(self.discriminator.parameters()), ] - # TODO: diff lr for each net if "adamw" in self.hparams.optimizer.lower(): for param_pair, network in zip(param_pairs, self.hparams.lr.keys()): optimizer = torch.optim.AdamW(param_pair, lr=self.hparams.lr[network]) From 
cc8f9b80d551e5be36717998dd24433d7340f9d0 Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Wed, 17 Mar 2021 19:55:59 +0000 Subject: [PATCH 54/62] testing on local and adding more flexibility --- config/timegan_config.yaml | 48 +++++++++++-------- .../modelling/augmentation/timegan.py | 34 +++++++++---- src/dagobert/modelling/dl/preprocessing.py | 1 + src/dagobert/naming.py | 5 ++ 4 files changed, 61 insertions(+), 27 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index 59bf1bc3..eb5de9e2 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -3,7 +3,7 @@ # LIGHTNING # -------------------------------------------------------------------------------------- -gpus: 0 +gpus: 1 pin_memory: True val_check_interval: 0.5 print_nan_grads: True @@ -19,14 +19,14 @@ tags: - time_gan_test no_comet_logger: True seed: 42 -batch_size: 64 +batch_size: 128 # -------------------------------------------------------------------------------------- # GAN # -------------------------------------------------------------------------------------- # gru or lstm -rnn: "gru" +rnn: "lstm" # embedding weight in cost of generator loss emb_weight: 1 @@ -41,20 +41,28 @@ binariser_method: # -------------------------------------------------------------------------------------- optimizer: "adamw" -dropout: 0.2 -num_layers: 1 -hidden_size: 10 +dropout: + recovery: 0.2 + embedder: 0.2 + supervisor: 0.2 + generator: 0.2 + discriminator: 0.2 + +num_layers: 2 +hidden_size: 32 z_dim: 32 -mini_series_length: 20 -# don't change order with lr dict +mini_series_length: 256 + +# don't change order with lr dict. +# generator_, embedder1_ separated out for ease of code for now. 
keep lr constant lr: - embedder0: 0.001 - supervisor: 0.001 - generator: 0.001 - embedder1: 0.001 - generator_: 0.001 - embedder1_: 0.001 - discriminator: 0.001 + embedder0: 0.0005 + supervisor: 0.0005 + generator: 0.0005 + embedder1: 0.0005 + generator_: 0.0005 + embedder1_: 0.0005 + discriminator: 0.0005 # -------------------------------------------------------------------------------------- # DATA @@ -62,11 +70,13 @@ lr: #data_dir: "C:/Work/dagobert/data/modelling" #data_dir: "/home/daniel/dagobert_data/modelling" -data_dir: "C:/Users/u164428/Desktop/Dagobert/data/modelling" +data_dir: "C:/Users/marcell/d/data/modelling" df_train: anchor: std_bar_ETHUSDT_tick_1.feather +# anchor: std_bar_ETHUSDT_tick_1.feather + df_val: df_test: @@ -78,8 +88,8 @@ cols_to_model: - high - low - close - - cum_ticks - - cum_dollar +# - cum_ticks +# - cum_dollar - volume - cum_volume_buy - cum_volume_sell @@ -100,7 +110,7 @@ augment_dfs_mix: 0 # -------------------------------------------------------------------------------------- train_start_date: "2019-01-01" -train_days: 10 +train_days: 300 val_days: 1 val_train_offset_days: 1 val_puffer_days: 1 diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 99263636..b429e3cc 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -79,11 +79,11 @@ def run_tgan(args): # setup callbacks checkpoint_callback = ModelCheckpoint( - monitor="loss_disc/val", + monitor="loss_gen/val", filename="_{epoch:02d}_{avg_reward:.10f}", dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", - save_top_k=3, - mode="max", + save_top_k=5, + mode="min", ) # define trainer and and lightning module @@ -201,7 +201,7 @@ def __init__(self, hparams: Namespace): num_layers=self.hparams.num_layers, linear_input_size=self.hparams.hidden_size, linear_output_size=self.hparams.hidden_size, - dropout=self.hparams.dropout, + 
dropout=self.hparams.dropout[NGAN.generator], batch_first=True, rnn=self.hparams.rnn, linear_activation=True, @@ -212,7 +212,7 @@ def __init__(self, hparams: Namespace): num_layers=self.hparams.num_layers, linear_input_size=self.hparams.hidden_size, linear_output_size=self.hparams.hidden_size, - dropout=self.hparams.dropout, + dropout=self.hparams.dropout[NGAN.embedder], batch_first=True, rnn=self.hparams.rnn, linear_activation=True, @@ -224,7 +224,7 @@ def __init__(self, hparams: Namespace): num_layers=self.hparams.num_layers, linear_input_size=self.hparams.hidden_size, linear_output_size=self.hparams.hidden_size, - dropout=self.hparams.dropout, + dropout=self.hparams.dropout[NGAN.supervisor], batch_first=True, rnn=self.hparams.rnn, linear_activation=True, @@ -235,7 +235,7 @@ def __init__(self, hparams: Namespace): num_layers=self.hparams.num_layers, linear_input_size=self.hparams.hidden_size, linear_output_size=all_inputs, - dropout=self.hparams.dropout, + dropout=self.hparams.dropout[NGAN.recovery], batch_first=True, rnn=self.hparams.rnn, linear_activation=True, @@ -248,7 +248,7 @@ def __init__(self, hparams: Namespace): linear_input_size=self.hparams.hidden_size * self.hparams.mini_series_length, linear_output_size=1, - dropout=self.hparams.dropout, + dropout=self.hparams.dropout[NGAN.discriminator], batch_first=True, rnn=self.hparams.rnn, linear_activation=False, @@ -441,6 +441,7 @@ def validation_step(self, batch, batch_idx): e_hat = self.generator(z) h_hat = self.supervisor(e_hat) x_hat = self.recovery(h_hat) + h_hat_supervise = self.supervisor(h) y_fake = self.discriminator(h_hat.detach()) y_fake_e = self.discriminator(e_hat.detach()) @@ -448,6 +449,23 @@ def validation_step(self, batch, batch_idx): pca_x, pca_x_hat = pca_analysis(t2n(x), t2n(x_hat)) + loss_gen = TimeGANLightning.generator_loss( + y_fake, + y_fake_e, + h, + h_hat_supervise, + x, + x_hat, + self.hparams.emb_weight, + ) + self.log( + "loss_gen/val", + loss_gen, + on_step=False, + 
on_epoch=True, + prog_bar=True, + logger=True, + ) loss_disc = TimeGANLightning.discriminator_loss( y_fake, y_fake_e, y_real, self.hparams.emb_weight ) diff --git a/src/dagobert/modelling/dl/preprocessing.py b/src/dagobert/modelling/dl/preprocessing.py index 5654064b..dddf7c82 100644 --- a/src/dagobert/modelling/dl/preprocessing.py +++ b/src/dagobert/modelling/dl/preprocessing.py @@ -345,6 +345,7 @@ def _preprocess_train_dfs( df_to_scale = df_split[cols_to_scale].values.reshape(-1, 1) log_msg = f"{cols_to_scale} of {df_name} {df_split_name} with {id(sc)}." try: + #todo: scikit 0.24+ needs values.reshape(-1,1) to transform check_is_fitted(sc) df_split[cols_to_scale] = sc.transform(df_to_scale) logger.info(f"Transformed {log_msg}") diff --git a/src/dagobert/naming.py b/src/dagobert/naming.py index fd2da5a8..f5af7438 100644 --- a/src/dagobert/naming.py +++ b/src/dagobert/naming.py @@ -293,3 +293,8 @@ class NGAN(object): gru = "gru" lstm = "lstm" + embedder = "embedder" + supervisor = "supervisor" + generator = "generator" + recovery = "recovery" + discriminator = "discriminator" From 79b9ee22fecf0f0ee55cb654fc62ca60e184292f Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Wed, 17 Mar 2021 20:12:38 +0000 Subject: [PATCH 55/62] black. run! --- src/dagobert/modelling/augmentation/timegan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index b429e3cc..32bc7a38 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -263,7 +263,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): """ Carries out updates to networks from a batch of real samples. Args: - batch: batch of X. 
+ batch: batch of X batch_idx: idx of batch optimizer_idx: idx that controls optimizing the 5 networks From f7c93d06d9e5002835782faca83abf54d241f0d9 Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Wed, 24 Mar 2021 22:47:32 +0000 Subject: [PATCH 56/62] more graphs --- config/timegan_config.yaml | 13 +++---- .../modelling/augmentation/timegan.py | 36 +++++++++++++++---- src/dagobert/modelling/augmentation/utils.py | 23 ++++++++++++ src/dagobert/modelling/utils.py | 28 +++++++++++++++ 4 files changed, 88 insertions(+), 12 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index eb5de9e2..cec6b6de 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -26,7 +26,7 @@ batch_size: 128 # -------------------------------------------------------------------------------------- # gru or lstm -rnn: "lstm" +rnn: "gru" # embedding weight in cost of generator loss emb_weight: 1 @@ -48,10 +48,10 @@ dropout: generator: 0.2 discriminator: 0.2 -num_layers: 2 -hidden_size: 32 +num_layers: 1 +hidden_size: 24 z_dim: 32 -mini_series_length: 256 +mini_series_length: 128 # don't change order with lr dict. # generator_, embedder1_ separated out for ease of code for now. 
keep lr constant @@ -75,7 +75,7 @@ data_dir: "C:/Users/marcell/d/data/modelling" df_train: anchor: std_bar_ETHUSDT_tick_1.feather -# anchor: std_bar_ETHUSDT_tick_1.feather +# df2: std_bar_BTCUSDT_tick_1.feather df_val: df_test: @@ -100,6 +100,7 @@ cols_to_model: # - cos_date # - sin_time # - cos_time +# df2: augment_method: augment_dfs: @@ -110,7 +111,7 @@ augment_dfs_mix: 0 # -------------------------------------------------------------------------------------- train_start_date: "2019-01-01" -train_days: 300 +train_days: 100 val_days: 1 val_train_offset_days: 1 val_puffer_days: 1 diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index 32bc7a38..c1197da8 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -36,7 +36,11 @@ from dagobert.modelling.dl.data import GeneratorCryptoDataset from dagobert.modelling.dl import AdaBelief -from dagobert.modelling.augmentation.utils import get_noise, pca_analysis +from dagobert.modelling.augmentation.utils import ( + get_noise, + pca_analysis, + tsne_analysis, +) from dagobert.modelling.utils import ( triple_barrier_error, non_vertical_error, @@ -48,6 +52,7 @@ fig_to_tb, fig_to_comet, plot_pca, + plot_tsne, update_lookback, plot_anchor_sample, ) @@ -448,6 +453,7 @@ def validation_step(self, batch, batch_idx): y_real = self.discriminator(h.detach()) pca_x, pca_x_hat = pca_analysis(t2n(x), t2n(x_hat)) + tsne_x, tsne_x_hat = tsne_analysis(t2n(x), t2n(x_hat)) loss_gen = TimeGANLightning.generator_loss( y_fake, @@ -484,6 +490,8 @@ def validation_step(self, batch, batch_idx): "y_real/val": y_real, "pca_x/val": pca_x, "pca_x_hat/val": pca_x_hat, + "tsne_x/val": tsne_x, + "tsne_x_hat/val": tsne_x_hat, } def validation_epoch_end(self, outputs): @@ -654,9 +662,6 @@ def _epoch_end(self, outputs, prefix="val"): Args: outputs: prefix: indicates train or val epoch end - - Returns: - """ if prefix == "train": pass @@ -667,6 
+672,8 @@ def _epoch_end(self, outputs, prefix="val"): y_fake_e = [] pca_x = [] pca_x_hat = [] + tsne_x = [] + tsne_x_hat = [] for x in outputs: avg_loss.append(x[f"loss_disc/{prefix}"]) y_real.append(x[f"y_real/{prefix}"]) @@ -674,8 +681,18 @@ def _epoch_end(self, outputs, prefix="val"): y_fake_e.append(x[f"y_fake_e/{prefix}"]) pca_x.append(x[f"pca_x/{prefix}"]) pca_x_hat.append(x[f"pca_x_hat/{prefix}"]) + tsne_x.append(x[f"tsne_x/{prefix}"]) + tsne_x_hat.append(x[f"tsne_x_hat/{prefix}"]) # log sampled images, only first batch (2 validation rounds @ start) - self._make_plots(y_real[0], y_fake[0], pca_x[0], pca_x_hat[0], prefix) + self._make_plots( + y_real[0], + y_fake[0], + pca_x[0], + pca_x_hat[0], + tsne_x[0], + tsne_x_hat[0], + prefix, + ) # ---------------------------------------------------------------------------------- # PLOTTING AND LOGGING FUNCTIONS @@ -712,7 +729,7 @@ def _log_graph(self, datasets: GeneratorCryptoDataset): examples = [torch.rand(*s).float().to(self.tgan_device) for s in example_shapes] self.logger.experiment[0].add_graph(self, examples) - def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, prefix): + def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, tsne_x, tsne_x_hat, prefix): """ Makes following useful summary plots: - plotting 2-dim PCA for visualising diversity learned @@ -725,6 +742,13 @@ def _make_plots(self, y_real, y_fake, pca_x, pca_x_hat, prefix): self.current_epoch, ) + # TSNE SCATTER + self._log_image( + f"real v fake TSNE-scatter/{prefix}", + plot_tsne(tsne_x, tsne_x_hat), + self.current_epoch, + ) + # HISTOGRAM # discriminator's take on real data y_real_class = (torch.sigmoid(y_real) > 0.5).int() diff --git a/src/dagobert/modelling/augmentation/utils.py b/src/dagobert/modelling/augmentation/utils.py index 11351f66..72fde853 100644 --- a/src/dagobert/modelling/augmentation/utils.py +++ b/src/dagobert/modelling/augmentation/utils.py @@ -3,6 +3,7 @@ import torch import numpy as np from sklearn.decomposition import 
PCA +from sklearn.manifold import TSNE def get_noise(n_samples: int, mini_series_length: int, z_dim: int, device: str = "cpu"): @@ -42,3 +43,25 @@ def pca_analysis(x, x_hat, components: int = 2): pca_results = pca.transform(x) pca_hat_results = pca.transform(x_hat) return pca_results, pca_hat_results + + +def tsne_analysis(x, x_hat, components: int = 2, n_iter=300): + """ + TSNE on 2 (real and synthetic) datasets + Args: + x: real data of shape (batch, time, feature) + x_hat: synthetic data of the same shape + components: number of components to keep + + Returns: + 2 arrays of TSNE-reduced real and synthetic data + """ + x = np.mean(x, 2) + x_hat = np.mean(x_hat, 2) + batch_len = x.shape[0] + + tsne = TSNE(n_components=components, n_iter=n_iter) + tsne_all = tsne.fit_transform(np.concatenate((x, x_hat), axis=0)) + tsne_results = tsne_all[:batch_len] + tsne_hat_results = tsne_all[batch_len:] + return tsne_results, tsne_hat_results diff --git a/src/dagobert/modelling/utils.py b/src/dagobert/modelling/utils.py index 0de5e3ff..c84612bb 100644 --- a/src/dagobert/modelling/utils.py +++ b/src/dagobert/modelling/utils.py @@ -449,3 +449,31 @@ def plot_pca(pca_x, pca_x_hat) -> Figure: plt.ylabel("y_pca") plt.close() return f + + +def plot_tsne(tsne_x, tsne_x_hat) -> Figure: + """ + Plot TSNE-reduced x and x_hat to visualise similarity. Overlap suggests similarity. + Args: + tsne_x: 2-component-PCA of x + tsne_x_hat: 2-component-PCA of x_hat + + Returns: + Scatter plot showing 2-component-TSNE of x & x_hat. 
+ """ + f, ax = plt.subplots(1) + + plt.scatter(tsne_x[:, 0], tsne_x[:, 1], c="black", alpha=0.2, label="Real") + plt.scatter( + tsne_x_hat[:, 0], + tsne_x_hat[:, 1], + c="red", + alpha=0.2, + label="Synthetic", + ) + ax.legend() + plt.title("TSNE plot") + plt.xlabel("x_tsne") + plt.ylabel("y_tsne") + plt.close() + return f From 337420dd6d6a5bb196a026e560ec57472aaba4bc Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Thu, 25 Mar 2021 11:50:57 +0000 Subject: [PATCH 57/62] keep models based on training loss --- src/dagobert/modelling/augmentation/timegan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index c1197da8..59c6b2b5 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -84,7 +84,7 @@ def run_tgan(args): # setup callbacks checkpoint_callback = ModelCheckpoint( - monitor="loss_gen/val", + monitor="loss_gen/train", filename="_{epoch:02d}_{avg_reward:.10f}", dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", save_top_k=5, From 951ffcdb60db66c67454fb5e84fb227d9c72673e Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Thu, 25 Mar 2021 11:59:11 +0000 Subject: [PATCH 58/62] changing config --- config/timegan_config.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index cec6b6de..034d5395 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -19,14 +19,14 @@ tags: - time_gan_test no_comet_logger: True seed: 42 -batch_size: 128 +batch_size: 256 # -------------------------------------------------------------------------------------- # GAN # -------------------------------------------------------------------------------------- # gru or lstm -rnn: "gru" +rnn: "lstm" # embedding weight in cost of generator loss emb_weight: 1 @@ -48,10 +48,10 @@ dropout: 
generator: 0.2 discriminator: 0.2 -num_layers: 1 +num_layers: 3 hidden_size: 24 z_dim: 32 -mini_series_length: 128 +mini_series_length: 256 # don't change order with lr dict. # generator_, embedder1_ separated out for ease of code for now. keep lr constant @@ -75,7 +75,7 @@ data_dir: "C:/Users/marcell/d/data/modelling" df_train: anchor: std_bar_ETHUSDT_tick_1.feather -# df2: std_bar_BTCUSDT_tick_1.feather + df2: std_bar_BTCUSDT_tick_1.feather df_val: df_test: @@ -88,7 +88,7 @@ cols_to_model: - high - low - close -# - cum_ticks + - cum_ticks # - cum_dollar - volume - cum_volume_buy @@ -100,7 +100,7 @@ cols_to_model: # - cos_date # - sin_time # - cos_time -# df2: + df2: augment_method: augment_dfs: @@ -111,7 +111,7 @@ augment_dfs_mix: 0 # -------------------------------------------------------------------------------------- train_start_date: "2019-01-01" -train_days: 100 +train_days: 500 val_days: 1 val_train_offset_days: 1 val_puffer_days: 1 From eec0ee7b7da66f1151f199fb00a4023cdf9163f8 Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Thu, 25 Mar 2021 13:16:13 +0000 Subject: [PATCH 59/62] changing config --- config/timegan_config.yaml | 4 ++-- src/dagobert/modelling/augmentation/timegan.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index 034d5395..c6743087 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -13,7 +13,7 @@ print_nan_grads: True # -------------------------------------------------------------------------------------- log_dir: logs -num_workers: 0 +num_workers: 8 exp_name: TGAN-test tags: - time_gan_test @@ -71,7 +71,7 @@ lr: #data_dir: "C:/Work/dagobert/data/modelling" #data_dir: "/home/daniel/dagobert_data/modelling" data_dir: "C:/Users/marcell/d/data/modelling" - +# data_dir: "home/ubuntu/dagobert/data/modelling" df_train: anchor: std_bar_ETHUSDT_tick_1.feather diff --git a/src/dagobert/modelling/augmentation/timegan.py 
b/src/dagobert/modelling/augmentation/timegan.py index 59c6b2b5..b1d62d29 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -84,7 +84,7 @@ def run_tgan(args): # setup callbacks checkpoint_callback = ModelCheckpoint( - monitor="loss_gen/train", + monitor="loss_gen/val", filename="_{epoch:02d}_{avg_reward:.10f}", dirpath=f"{args.log_dir}/models/{args.exp_name}_{tb_logger.version}", save_top_k=5, @@ -395,6 +395,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): # pytorch lightning needs to have "loss" in the return dict return { "loss": loss_disc, + "loss_gen/train": loss_gen, "loss_disc/train": loss_disc, "y_fake/train": y_fake, "y_fake_e/train": y_fake_e, @@ -484,6 +485,7 @@ def validation_step(self, batch, batch_idx): logger=True, ) return { + "loss_gen/val": loss_gen, "loss_disc/val": loss_disc, "y_fake/val": y_fake, "y_fake_e/val": y_fake_e, From fa5ab6fa2d89acdbd872b7972a722028266a89e6 Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Thu, 25 Mar 2021 13:26:31 +0000 Subject: [PATCH 60/62] changing config --- config/timegan_config.yaml | 2 +- src/dagobert/modelling/augmentation/timegan.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index c6743087..ffe7ebc4 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -71,7 +71,7 @@ lr: #data_dir: "C:/Work/dagobert/data/modelling" #data_dir: "/home/daniel/dagobert_data/modelling" data_dir: "C:/Users/marcell/d/data/modelling" -# data_dir: "home/ubuntu/dagobert/data/modelling" +# data_dir: "/home/ubuntu/dagobert/data/modelling" df_train: anchor: std_bar_ETHUSDT_tick_1.feather diff --git a/src/dagobert/modelling/augmentation/timegan.py b/src/dagobert/modelling/augmentation/timegan.py index b1d62d29..bdccfd7e 100644 --- a/src/dagobert/modelling/augmentation/timegan.py +++ b/src/dagobert/modelling/augmentation/timegan.py @@ -395,7 +395,6 @@ def 
training_step(self, batch, batch_idx, optimizer_idx): # pytorch lightning needs to have "loss" in the return dict return { "loss": loss_disc, - "loss_gen/train": loss_gen, "loss_disc/train": loss_disc, "y_fake/train": y_fake, "y_fake_e/train": y_fake_e, From 96570be6abd5841b54ef485e70f8224bac7612e8 Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Wed, 14 Jul 2021 12:21:32 +0100 Subject: [PATCH 61/62] config and accessing node nb --- config/timegan_config.yaml | 4 +- config/timegan_config_local.yaml | 122 ++++++++ .../experiments/interact_with_nodes.ipynb | 2 +- .../interact_with_nodes_tgan.ipynb | 268 ++++++++++++++++++ 4 files changed, 393 insertions(+), 3 deletions(-) create mode 100644 config/timegan_config_local.yaml create mode 100644 notebooks/experiments/interact_with_nodes_tgan.ipynb diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index ffe7ebc4..1c76881d 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -70,8 +70,8 @@ lr: #data_dir: "C:/Work/dagobert/data/modelling" #data_dir: "/home/daniel/dagobert_data/modelling" -data_dir: "C:/Users/marcell/d/data/modelling" -# data_dir: "/home/ubuntu/dagobert/data/modelling" +# data_dir: "C:/Users/marcell/d/data/modelling" +data_dir: "/home/ubuntu/dagobert/data/modelling" df_train: anchor: std_bar_ETHUSDT_tick_1.feather diff --git a/config/timegan_config_local.yaml b/config/timegan_config_local.yaml new file mode 100644 index 00000000..ffe7ebc4 --- /dev/null +++ b/config/timegan_config_local.yaml @@ -0,0 +1,122 @@ + +# -------------------------------------------------------------------------------------- +# LIGHTNING +# -------------------------------------------------------------------------------------- + +gpus: 1 +pin_memory: True +val_check_interval: 0.5 +print_nan_grads: True + +# -------------------------------------------------------------------------------------- +# RUN +# -------------------------------------------------------------------------------------- + 
+log_dir: logs +num_workers: 8 +exp_name: TGAN-test +tags: + - time_gan_test +no_comet_logger: True +seed: 42 +batch_size: 256 + +# -------------------------------------------------------------------------------------- +# GAN +# -------------------------------------------------------------------------------------- + +# gru or lstm +rnn: "lstm" +# embedding weight in cost of generator loss +emb_weight: 1 + +# don't change these, or preprocessing won't work +target_col: +to_label: False +no_sample_weights: True +binariser_method: + +# -------------------------------------------------------------------------------------- +# MODEL +# -------------------------------------------------------------------------------------- + +optimizer: "adamw" +dropout: + recovery: 0.2 + embedder: 0.2 + supervisor: 0.2 + generator: 0.2 + discriminator: 0.2 + +num_layers: 3 +hidden_size: 24 +z_dim: 32 +mini_series_length: 256 + +# don't change order with lr dict. +# generator_, embedder1_ separated out for ease of code for now. 
keep lr constant +lr: + embedder0: 0.0005 + supervisor: 0.0005 + generator: 0.0005 + embedder1: 0.0005 + generator_: 0.0005 + embedder1_: 0.0005 + discriminator: 0.0005 + +# -------------------------------------------------------------------------------------- +# DATA +# -------------------------------------------------------------------------------------- + +#data_dir: "C:/Work/dagobert/data/modelling" +#data_dir: "/home/daniel/dagobert_data/modelling" +data_dir: "C:/Users/marcell/d/data/modelling" +# data_dir: "/home/ubuntu/dagobert/data/modelling" + +df_train: + anchor: std_bar_ETHUSDT_tick_1.feather + df2: std_bar_BTCUSDT_tick_1.feather + +df_val: +df_test: + +# the cols of the secondary DFs will automatically be set to anchor's if not defined +cols_to_model: + anchor: + - date_diff + - open + - high + - low + - close + - cum_ticks +# - cum_dollar + - volume + - cum_volume_buy + - cum_volume_sell + - cum_volume_quote + - cum_volume_quote_buy + - cum_volume_quote_sell +# - sin_date +# - cos_date +# - sin_time +# - cos_time + df2: + +augment_method: +augment_dfs: +augment_dfs_mix: 0 + +# -------------------------------------------------------------------------------------- +# PREPROCESSING +# -------------------------------------------------------------------------------------- + +train_start_date: "2019-01-01" +train_days: 500 +val_days: 1 +val_train_offset_days: 1 +val_puffer_days: 1 +test_days: 1 +test_train_offset_days: 62 +test_puffer_days: 1 + +scaling_method: minmax \ No newline at end of file diff --git a/notebooks/experiments/interact_with_nodes.ipynb b/notebooks/experiments/interact_with_nodes.ipynb index f6e63954..6aa7b0a9 100644 --- a/notebooks/experiments/interact_with_nodes.ipynb +++ b/notebooks/experiments/interact_with_nodes.ipynb @@ -228,7 +228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.5" } }, "nbformat": 4, diff --git 
a/notebooks/experiments/interact_with_nodes_tgan.ipynb b/notebooks/experiments/interact_with_nodes_tgan.ipynb new file mode 100644 index 00000000..3692f08f --- /dev/null +++ b/notebooks/experiments/interact_with_nodes_tgan.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'paramiko'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpathlib\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mPath\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mparamiko\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'paramiko'" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import time\n", + "import paramiko " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'C:\\\\Users\\\\marcell\\\\d\\\\dagobert\\\\notebooks\\\\experiments'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## London GPUs\n", + "\n", + "- log in to all 10\n", + "- pull latest branch\n", + "- delete prev data files" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "london_gpus = [\n", + " 
\"ec2-3-8-198-113.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-132-49-7.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-35-178-168-24.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-130-246-221.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-52-56-202-156.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-132-17-125.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-35-178-170-162.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-3-8-155-239.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-18-130-180-205.eu-west-2.compute.amazonaws.com\",\n", + " \"ec2-3-8-194-52.eu-west-2.compute.amazonaws.com\",\n", + "]\n", + "username = \"ubuntu\"\n", + "london_k = paramiko.RSAKey.from_private_key_file(\"../../../sec/dagobert.pem\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 ec2-3-8-198-113.eu-west-2.compute.amazonaws.com\n", + "0 b''\n", + "1 ec2-18-132-49-7.eu-west-2.compute.amazonaws.com\n", + "1 b''\n", + "2 ec2-35-178-168-24.eu-west-2.compute.amazonaws.com\n", + "2 b''\n", + "3 ec2-18-130-246-221.eu-west-2.compute.amazonaws.com\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 16\u001b[0m \"\"\"\n\u001b[0;32m 17\u001b[0m \u001b[0mssh_stdin\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mssh_stdout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mssh_stderr\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mssh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexec_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcmd2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 18\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mssh_stderr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\file.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 198\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 199\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 200\u001b[1;33m \u001b[0mnew_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_DEFAULT_BUFSIZE\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 201\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mEOFError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 202\u001b[0m \u001b[0mnew_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\channel.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 1374\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1375\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0msize\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1376\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mchannel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_stderr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1377\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1378\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_write\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\channel.py\u001b[0m in \u001b[0;36mrecv_stderr\u001b[1;34m(self, nbytes)\u001b[0m\n\u001b[0;32m 745\u001b[0m \"\"\"\n\u001b[0;32m 746\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 747\u001b[1;33m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0min_stderr_buffer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 748\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mPipeTimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 749\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\site-packages\\paramiko\\buffered_pipe.py\u001b[0m in 
\u001b[0;36mread\u001b[1;34m(self, nbytes, timeout)\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_closed\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[0mthen\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 160\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_cv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 161\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 162\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;33m-=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mthen\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\dagobert\\lib\\threading.py\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 294\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 295\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m 
\u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 296\u001b[1;33m \u001b[0mwaiter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 297\u001b[0m \u001b[0mgotit\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 298\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for i, hostname in enumerate(london_gpus):\n", + " ssh = paramiko.SSHClient()\n", + " ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n", + " ssh.connect(hostname=hostname, username=username, pkey=london_k)\n", + " print(i, hostname)\n", + " cmd = \"\"\"\n", + " cd dagobert/dagobert;\n", + " rm ../data/modelling/*;\n", + " git pull https://danielhomola:4frvgh%GTB@github.com/danielhomola/dagobert hparams_labelling;\n", + " \"\"\"\n", + " ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(cmd)\n", + " print(i, ssh_stderr.read())\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Zip all log folders\n", + "\n", + "- log in to all gpus and zip all folders that start with log and log the models too\n", + "- uplaod them to s3\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import time\n", + "import paramiko " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "local_path = Path('/home/daniel/dagobert_data/')\n", + "username = \"ubuntu\"\n", + "ohio_k = paramiko.RSAKey.from_private_key_file(\"../../../sec/dagobert_preprocessing_node.pem\")\n", + "london_k = 
paramiko.RSAKey.from_private_key_file(\"../../../sec/dagobert.pem\")\n", + "nodes = {\n", + " \"gpu1\": {\"hostname\": \"ec2-52-20-7-61.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu2\": {\"hostname\": \"ec2-52-22-178-27.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu3\": {\"hostname\": \"ec2-54-147-237-118.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu4\": {\"hostname\": \"ec2-54-152-39-74.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu5\": {\"hostname\": \"ec2-54-225-32-4.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu6\": {\"hostname\": \"ec2-54-90-219-179.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu7\": {\"hostname\": \"ec2-100-24-115-15.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu8\": {\"hostname\": \"ec2-3-236-251-175.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu9\": {\"hostname\": \"ec2-34-237-76-111.compute-1.amazonaws.com\", \"key\": ohio_k},\n", + " \"gpu10\": {\"hostname\": \"ec2-3-10-228-3.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu11\": {\"hostname\": \"ec2-18-130-191-126.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu12\": {\"hostname\": \"ec2-3-10-150-229.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu13\": {\"hostname\": \"ec2-3-8-28-118.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu14\": {\"hostname\": \"ec2-35-176-172-205.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu15\": {\"hostname\": \"ec2-18-133-29-17.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu16\": {\"hostname\": \"ec2-18-133-64-254.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu17\": {\"hostname\": \"ec2-3-8-197-96.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu18\": {\"hostname\": \"ec2-35-178-66-77.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + " \"gpu19\": {\"hostname\": 
\"ec2-3-8-181-180.eu-west-2.compute.amazonaws.com\",\"key\": london_k},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------------------\n", + "gpu6\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu6\n", + "-------------------------------------------\n", + "gpu7\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu7\n", + "-------------------------------------------\n", + "gpu8\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu8\n", + "-------------------------------------------\n", + "gpu9\n", + "-------------------------------------------\n", + "Uploaded everything successfully for gpu9\n" + ] + } + ], + "source": [ + "for name, node in nodes.items():\n", + " ssh = paramiko.SSHClient()\n", + " ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n", + " ssh.connect(hostname=node['hostname'], username=username, pkey=node['key'])\n", + " \n", + " print('-------------------------------------------')\n", + " print(name)\n", + " print('-------------------------------------------')\n", + " \n", + " cmd = (\n", + " f\"cd dagobert/dagobert;\"\n", + " f\"sudo apt install zip;\"\n", + " f\"rm -rf logs_run1;\"\n", + " f\"ls | grep logs | xargs zip {name}_all_logs.zip -r;\"\n", + " f\"zip {name}_models.zip -r TCN;\"\n", + " f\"aws s3 cp {name}_all_logs.zip s3://dagobert/;\"\n", + " f\"aws s3 cp {name}_models.zip s3://dagobert/;\"\n", + " )\n", + " ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(cmd)\n", + " print(f'Uploaded everything successfully for {name}')\n", + " \n", + " # download zip - not used in the end because we have better ways\n", + " # ftp_client=ssh.open_sftp()\n", + " # ftp_client.get(\"/home/ubuntu/dagobert/dagobert/all_logs.zip\", local_path / 
f\"{name}_all_logs.zip\")\n", + " # print (f\"Downloaded all_zips from {name}.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 11296f7e75ad19f6ea558e7dea3d5b72acdad14a Mon Sep 17 00:00:00 2001 From: marcell_mate Date: Thu, 15 Jul 2021 12:41:22 +0100 Subject: [PATCH 62/62] tgan readme --- config/timegan_config.yaml | 16 +++--- src/dagobert/modelling/augmentation/README.md | 54 +++++++++++++++++++ 2 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 src/dagobert/modelling/augmentation/README.md diff --git a/config/timegan_config.yaml b/config/timegan_config.yaml index 1c76881d..10a9f0bc 100644 --- a/config/timegan_config.yaml +++ b/config/timegan_config.yaml @@ -22,14 +22,9 @@ seed: 42 batch_size: 256 # -------------------------------------------------------------------------------------- -# GAN +# PREPROCESSING # -------------------------------------------------------------------------------------- -# gru or lstm -rnn: "lstm" -# embedding weight in cost of generator loss -emb_weight: 1 - # don't change these, or preprocessing won't work target_col: to_label: False @@ -40,6 +35,13 @@ binariser_method: # MODEL # -------------------------------------------------------------------------------------- + +# gru or lstm +rnn: "lstm" + +# embedding weight in cost of generator loss +emb_weight: 1 + optimizer: "adamw" dropout: recovery: 0.2 @@ -107,7 +109,7 @@ augment_dfs: augment_dfs_mix: 0 # -------------------------------------------------------------------------------------- -# PREPROCESSING +# PREPROCESSING DATES # 
--------------------------------------------------------------------------------------

train_start_date: "2019-01-01"

diff --git a/src/dagobert/modelling/augmentation/README.md b/src/dagobert/modelling/augmentation/README.md
new file mode 100644
index 00000000..fc4ee69f
--- /dev/null
+++ b/src/dagobert/modelling/augmentation/README.md
@@ -0,0 +1,54 @@
+# Dagobert augmentation / TGAN module
+
+This module holds the implementation of TimeGAN. It is adapted to fit into PyTorch Lightning so we get benefits of easy
+set up, checkpointing etc. The network is based on
+[this paper](https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf).
+
+A couple of other useful links can be found in [this issue](https://github.com/danielhomola/dagobert/issues/63)
+
+## Running it
+Like the `dl` module, this module is to be driven from the cmd line via an entry-point and a config file.
+```
+dagobert-tgan -c config/timegan_config.yaml
+```
+
+## Config params
+There are some config params which we can tinker with for optimal training. The example one
+(`config/timegan_config.yaml`) is nicely structured in blocks so it should be easy to understand which relate to the
+TimeGAN model structure.
+
+The params are nicely documented in `augmentation/tgan_args.py` so make sure to check there before trying to find out
+from the code what each of these do.
+
+## The TimeGAN
+Much of the structure was implemented as quoted from the original paper (above). Five RNNs work together to create a
+learned embedding space optimized with both supervised and adversarial objectives, encouraging the network to adhere to
+the temporal dynamics of the training data. 
+
+GAN convergence is notoriously tricky, and there are a bunch of handles and hyperparameters we can toggle
+(some of this is inspired by various literature about training GANs more widely):
+- in order for the discriminator not to get 'too smart', we optimise it on the simple condition that the loss is not too
+ small.
+- the generator (and one of the embedders) is optimised twice before every optimisation of the discriminator - this is currently implemented in a
+ crude way, but it works.
+- convergence is very training intensive, the authors and [other implementations](https://github.com/jsyoon0823/TimeGAN)
+ all refer to 5-10k epochs
+- one important aspect of this network is that the output series' length is a hyperparameter we set before training, and
+ need to feed in the same length for X - we call this mini series length
+- performance is measured visually by PCA, and t-SNE analyses between the original and the synthetic data. For
+discriminative performance the authors use an RNN classifier to distinguish between real and synthetic data. For
+ predictive performance they train an RNN to predict the last element of a series on synthetic samples. This trained
+ RNN is then validated on real data, measuring MAE. These latter two are not implemented currently, as synthetic data
+ was already visibly flawed upon visual inspection of PCA, t-SNE.
+
+## Future work
+- experiment with more advanced learning rates for different components of TimeGAN
+- warm-up training of the generator is a common measure to avoid lack of convergence
+- try different thresholds for prohibiting discriminator optimisation
+- it is challenging to iterate fast with this project. Convergence can take time, and training the generator
+can go well initially and then deteriorate or vice versa. To always wait for being able to inspect (PCA, t-SNE) visuals,
+ is time-consuming and inconsistent. Some distance measurements between real/synthetic can come in handy for triggering
+ various actions, or just introducing more consistency into monitoring.
+
+