From 8104ddaf7859136c34300509caaa9de74bd378b6 Mon Sep 17 00:00:00 2001 From: Ben Messerly Date: Fri, 26 Aug 2022 14:23:08 -0500 Subject: [PATCH 1/7] Fiddling with RL ML. --- DeepReinforcementLearning.ipynb | 423 ++++++++++++++++++++++++++++++++ DumbGame.py | 62 +++++ hanabi_ml_2.ipynb | 228 +++++++++++++++++ tests/DumbGame.py | 93 +++++++ 4 files changed, 806 insertions(+) create mode 100644 DeepReinforcementLearning.ipynb create mode 100644 DumbGame.py create mode 100644 hanabi_ml_2.ipynb create mode 100644 tests/DumbGame.py diff --git a/DeepReinforcementLearning.ipynb b/DeepReinforcementLearning.ipynb new file mode 100644 index 0000000..fc934a5 --- /dev/null +++ b/DeepReinforcementLearning.ipynb @@ -0,0 +1,423 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 0. Install Dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Test Random Environment with OpenAI Gym" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "States:(1,) Actions:5\n" + ] + } + ], + "source": [ + "from DumbGame import DumbGameEnv\n", + "env = DumbGameEnv()\n", + "states = env.observation_space.shape\n", + "actions = env.action_space.n\n", + "print(f\"States:{states} Actions:{actions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Create a Deep Learning Model with Keras" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Flatten\n", + "from tensorflow.keras.optimizers import Adam" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "states = env.observation_space.shape\n", + "actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(states, actions):\n", + " model = Sequential() \n", + " model.add(Dense(24, activation='relu', input_shape=states))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(actions, activation='linear'))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "del model " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model(states, actions)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "dense_6 (Dense) (None, 24) 48 \n", + "_________________________________________________________________\n", + "dense_7 (Dense) (None, 24) 600 \n", + "_________________________________________________________________\n", + "dense_8 (Dense) (None, 5) 125 \n", + 
"=================================================================\n", + "Total params: 773\n", + "Trainable params: 773\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Build Agent with Keras-RL" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from rl.agents import DQNAgent\n", + "from rl.policy import BoltzmannQPolicy\n", + "from rl.memory import SequentialMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def build_agent(model, actions):\n", + " policy = BoltzmannQPolicy()\n", + " memory = SequentialMemory(limit=50000, window_length=1)\n", + " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", + " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", + " return dqn" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training for 50000 steps ...\n", + "Interval 1 (0 steps performed)\n", + "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + " 1/10000 [..............................] - ETA: 9:10 - reward: -1.0000" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", + " warnings.warn('Not enough entries to sample without replacement. 
Consider increasing your warm-up phase to avoid oversampling!')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000/10000 [==============================] - 73s 7ms/step - reward: -0.5974\n", + "402 episodes - episode_reward: -14.821 [-47.000, 5.000] - loss: 1.242 - mae: 5.141 - mean_q: -6.008\n", + "\n", + "Interval 2 (10000 steps performed)\n", + "10000/10000 [==============================] - 70s 7ms/step - reward: -0.5944\n", + "406 episodes - episode_reward: -14.675 [-52.000, 3.000] - loss: 1.479 - mae: 6.070 - mean_q: -7.153\n", + "\n", + "Interval 3 (20000 steps performed)\n", + "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5978\n", + "402 episodes - episode_reward: -14.876 [-59.000, 2.000] - loss: 1.487 - mae: 6.079 - mean_q: -7.167\n", + "\n", + "Interval 4 (30000 steps performed)\n", + "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5982\n", + "402 episodes - episode_reward: -14.883 [-60.000, 4.000] - loss: 1.505 - mae: 6.153 - mean_q: -7.265\n", + "\n", + "Interval 5 (40000 steps performed)\n", + "10000/10000 [==============================] - 73s 7ms/step - reward: -0.6216\n", + "done, took 357.885 seconds\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", + "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 100 episodes ...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/wp/_fng4ppn01b2j4_j98240s780000gn/T/ipykernel_10921/978772492.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'episode_reward'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 339\u001b[0m 
\u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepisode_step\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 341\u001b[0;31m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 342\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, observation)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;31m# Select an action.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0mstate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmemory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_recent_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 224\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 225\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_q_values\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
69\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_batch_q_values\u001b[0;34m(self, state_batch)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_state_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_on_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py\u001b[0m in \u001b[0;36mpredict_on_batch\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1213\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_predict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1214\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 3823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3824\u001b[0m fetched = self._callable_fn(*array_vals,\n\u001b[0;32m-> 3825\u001b[0;31m 
run_metadata=self.run_metadata)\n\u001b[0m\u001b[1;32m 3826\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_fetch_callbacks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3827\u001b[0m output_structure = nest.pack_sequence_as(\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1470\u001b[0m ret = tf_session.TF_SessionRunCallable(self._session._session,\n\u001b[1;32m 1471\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1472\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 1473\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1474\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", + "print(np.mean(scores.history['episode_reward']))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 15 episodes ...\n", + "Episode 1: reward: 200.000, steps: 200\n", + "Episode 2: reward: 200.000, steps: 200\n", + "Episode 3: reward: 200.000, steps: 200\n", + "Episode 4: reward: 200.000, steps: 200\n", + "Episode 5: reward: 200.000, steps: 200\n", + "Episode 6: reward: 200.000, steps: 200\n", + "Episode 7: reward: 200.000, steps: 200\n", + "Episode 8: reward: 200.000, steps: 200\n", + "Episode 9: reward: 200.000, steps: 200\n", + "Episode 10: reward: 200.000, steps: 200\n", + "Episode 11: reward: 200.000, steps: 200\n", + "Episode 12: reward: 200.000, steps: 200\n", + "Episode 13: reward: 200.000, steps: 200\n", + "Episode 14: reward: 200.000, steps: 200\n", + "Episode 15: reward: 200.000, steps: 200\n" + ] + } + ], + "source": [ + "_ = dqn.test(env, nb_episodes=15, visualize=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. 
Reloading Agent from Memory" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.save_weights('dqn_weights.h5f', overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "del model\n", + "del dqn\n", + "del env" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make('CartPole-v0')\n", + "actions = env.action_space.n\n", + "states = env.observation_space.shape[0]\n", + "model = build_model(states, actions)\n", + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.load_weights('dqn_weights.h5f')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 5 episodes ...\n", + "WARNING:tensorflow:From /Users/nicholasrenotte/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + "Episode 1: reward: 200.000, steps: 200\n", + "Episode 2: reward: 200.000, steps: 200\n", + "Episode 3: reward: 200.000, steps: 200\n", + "Episode 4: reward: 200.000, steps: 200\n", + "Episode 5: reward: 200.000, steps: 200\n" + ] + } + ], + "source": [ + "_ = dqn.test(env, nb_episodes=5, visualize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/DumbGame.py b/DumbGame.py new file mode 100644 index 0000000..f6d8e27 --- /dev/null +++ b/DumbGame.py @@ -0,0 +1,62 @@ +################################################################################ +# Dumb game to be played by machines +# Guess the correct order of the numbers 1-5 which are shuffled. +# Keep guessing until you get the whole sequence. +# Penalized -1 for every wrong guess. 
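+# (Clarifying note: the answer is generated with list(range(5)), so the shuffled
+#  numbers are actually 0-4, and the observation returned by step() is the count
+#  of correct guesses made so far in the current episode.)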
+################################################################################ + +from gym import Env +from gym.spaces import Discrete, Box +import random +import numpy as np + + +class DumbGameEnv(Env): + def __init__(self): + self.n_numbers = 5 + self.answer = list(range(self.n_numbers)) + random.shuffle(self.answer) + self.state = 0 + self.action_space = Discrete(5) + self.observation_space = Box(low=np.array([0],dtype=np.float32), high=np.array([2],dtype=np.float32)) + + def step(self, action): + reward = 0 + if action == self.answer[self.state]: + self.state += 1 + reward = 1 + else: + reward = -1 + + done = self.state == self.n_numbers or self.state < -50 + + info = {} + + # Return step information + return self.state, reward, done, info + + def render(self): + pass + + def reset(self): + random.shuffle(self.answer) + self.state = 0 + self.n_guesses = 0 + return self.state + +if __name__ == "__main__": + env = DumbGameEnv() + #print(env.observation_space.sample()) # 0-1 + #print(env.action_space.sample()) # 0-4 + episodes = 10 + for episode in range(1, episodes+1): + state = env.reset() + done = False + score = 0 + n_guesses = 0 + while not done: + n_guesses += 1 + action = env.action_space.sample() + n_state, reward, done, info = env.step(action) + score+=reward + print(f'Episode:{episode} Score:{score} NGuesses:{n_guesses}') diff --git a/hanabi_ml_2.ipynb b/hanabi_ml_2.ipynb new file mode 100644 index 0000000..a83d061 --- /dev/null +++ b/hanabi_ml_2.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from DumbGame import DumbGameEnv\n", + "import numpy as np\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Flatten\n", + "from tensorflow.keras.optimizers import Adam" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "States:(1,) Actions:5\n" + ] + } + ], + "source": [ + "env = DumbGameEnv()\n", + "states = env.observation_space.shape\n", + "actions = env.action_space.n\n", + "print(f\"States:{states} Actions:{actions}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(states, actions):\n", + " model = Sequential() \n", + " model.add(Dense(24, activation='relu', input_shape=states))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(actions, activation='linear'))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from rl.agents import DQNAgent\n", + "from rl.policy import BoltzmannQPolicy\n", + "from rl.memory import SequentialMemory\n", + "\n", + "def build_agent(model, actions):\n", + " policy = BoltzmannQPolicy()\n", + " memory = SequentialMemory(limit=20000, window_length=1)\n", + " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", + " nb_actions=actions, nb_steps_warmup=20, target_model_update=0.1)\n", + " return dqn" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "dense (Dense) (None, 24) 
48 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (None, 24) 600 \n", + "_________________________________________________________________\n", + "dense_2 (Dense) (None, 5) 125 \n", + "=================================================================\n", + "Total params: 773\n", + "Trainable params: 773\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model = build_model(states, actions)\n", + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training for 5000 steps ...\n", + "Interval 1 (0 steps performed)\n", + "WARNING:tensorflow:From /Users/Ben/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + " 1/10000 [..............................] - ETA: 8:06 - reward: -1.0000" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/anaconda3/lib/python3.6/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", + " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 4995/10000 [=============>................] - ETA: 29s - reward: -0.6040done, took 29.182 seconds\n" + ] + } + ], + "source": [ + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=0.1))#, metrics=['mae'])\n", + "history = dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'nb_steps': 5000}\n", + "{'episode_reward': [-16.0, -10.0, -8.0, -24.0, -26.0, -2.0, -8.0, -21.0, -1.0, -2.0, -16.0, 0.0, -31.0, -38.0, -23.0, -38.0, -21.0, -22.0, -4.0, -9.0, -19.0, -19.0, -20.0, -21.0, -8.0, -29.0, -26.0, -18.0, -22.0, -10.0, -17.0, -17.0, -19.0, -50.0, -31.0, -16.0, -4.0, -13.0, -23.0, -29.0, -18.0, -22.0, -21.0, -9.0, -9.0, 0.0, -20.0, -28.0, -20.0, -26.0, -13.0, -25.0, -3.0, -11.0, -9.0, -7.0, -18.0, -11.0, -20.0, -15.0, -14.0, -15.0, -5.0, 0.0, -27.0, -46.0, -13.0, -7.0, -16.0, -16.0, -16.0, -12.0, -9.0, -16.0, -3.0, -13.0, -15.0, -5.0, -44.0, -11.0, 1.0, -26.0, -21.0, -18.0, -27.0, -33.0, -24.0, -12.0, -13.0, -6.0, -22.0, -1.0, -11.0, -2.0, -24.0, -1.0, -14.0, -7.0, -8.0, -27.0, -14.0, -18.0, -31.0, -20.0, -9.0, -3.0, 0.0, -27.0, -7.0, -16.0, -16.0, -10.0, -31.0, -14.0, -7.0, -16.0, -3.0, -24.0, -16.0, -39.0, -44.0, -20.0, -8.0, -2.0, -25.0, -8.0, -24.0, -36.0, -6.0, -20.0, -11.0, -21.0, -27.0, -33.0, -8.0, -12.0, -28.0, -8.0, -35.0, -4.0, -8.0, -27.0, 1.0, -8.0, -9.0, -12.0, -20.0, -14.0, 1.0, -23.0, -15.0, -5.0, -2.0, -14.0, -29.0, -18.0, -24.0, -4.0, -7.0, -26.0, -11.0, -18.0, -15.0, -10.0, -25.0, -6.0, -5.0, -13.0, -8.0, -13.0, -2.0, -7.0, -3.0, -9.0, -20.0, 1.0, -10.0, -4.0, -23.0, -16.0, -24.0, -9.0, -9.0, -6.0, -8.0, -10.0, -26.0, 1.0, -24.0, -14.0, -6.0, -22.0, -3.0, -1.0, -2.0, -14.0, -8.0, -15.0], 'nb_episode_steps': [26, 
20, 18, 34, 36, 12, 18, 31, 11, 12, 26, 10, 41, 48, 33, 48, 31, 32, 14, 19, 29, 29, 30, 31, 18, 39, 36, 28, 32, 20, 27, 27, 29, 60, 41, 26, 14, 23, 33, 39, 28, 32, 31, 19, 19, 10, 30, 38, 30, 36, 23, 35, 13, 21, 19, 17, 28, 21, 30, 25, 24, 25, 15, 10, 37, 56, 23, 17, 26, 26, 26, 22, 19, 26, 13, 23, 25, 15, 54, 21, 9, 36, 31, 28, 37, 43, 34, 22, 23, 16, 32, 11, 21, 12, 34, 11, 24, 17, 18, 37, 24, 28, 41, 30, 19, 13, 10, 37, 17, 26, 26, 20, 41, 24, 17, 26, 13, 34, 26, 49, 54, 30, 18, 12, 35, 18, 34, 46, 16, 30, 21, 31, 37, 43, 18, 22, 38, 18, 45, 14, 18, 37, 9, 18, 19, 22, 30, 24, 9, 33, 25, 15, 12, 24, 39, 28, 34, 14, 17, 36, 21, 28, 25, 20, 35, 16, 15, 23, 18, 23, 12, 17, 13, 19, 30, 9, 20, 14, 33, 26, 34, 19, 19, 16, 18, 20, 36, 9, 34, 24, 16, 32, 13, 11, 12, 24, 18, 25], 'nb_steps': [26, 46, 64, 98, 134, 146, 164, 195, 206, 218, 244, 254, 295, 343, 376, 424, 455, 487, 501, 520, 549, 578, 608, 639, 657, 696, 732, 760, 792, 812, 839, 866, 895, 955, 996, 1022, 1036, 1059, 1092, 1131, 1159, 1191, 1222, 1241, 1260, 1270, 1300, 1338, 1368, 1404, 1427, 1462, 1475, 1496, 1515, 1532, 1560, 1581, 1611, 1636, 1660, 1685, 1700, 1710, 1747, 1803, 1826, 1843, 1869, 1895, 1921, 1943, 1962, 1988, 2001, 2024, 2049, 2064, 2118, 2139, 2148, 2184, 2215, 2243, 2280, 2323, 2357, 2379, 2402, 2418, 2450, 2461, 2482, 2494, 2528, 2539, 2563, 2580, 2598, 2635, 2659, 2687, 2728, 2758, 2777, 2790, 2800, 2837, 2854, 2880, 2906, 2926, 2967, 2991, 3008, 3034, 3047, 3081, 3107, 3156, 3210, 3240, 3258, 3270, 3305, 3323, 3357, 3403, 3419, 3449, 3470, 3501, 3538, 3581, 3599, 3621, 3659, 3677, 3722, 3736, 3754, 3791, 3800, 3818, 3837, 3859, 3889, 3913, 3922, 3955, 3980, 3995, 4007, 4031, 4070, 4098, 4132, 4146, 4163, 4199, 4220, 4248, 4273, 4293, 4328, 4344, 4359, 4382, 4400, 4423, 4435, 4452, 4465, 4484, 4514, 4523, 4543, 4557, 4590, 4616, 4650, 4669, 4688, 4704, 4722, 4742, 4778, 4787, 4821, 4845, 4861, 4893, 4906, 4917, 4929, 4953, 4971, 4996]}\n" + ] + } + ], + "source": [ + "print(history.params)\n", + "print(history.history)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 1 episodes ...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mepisode_step\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;34m-\u001b[0m 
\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0mdone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterminal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0mepisode_reward\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, reward, terminal)\u001b[0m\n\u001b[1;32m 240\u001b[0m training=self.training)\n\u001b[1;32m 241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 242\u001b[0;31m \u001b[0mmetrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 243\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;31m# We're done here. No need to update the experience memory since we only use the working\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mmetrics_names\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "scores = dqn.test(env, nb_episodes=1, visualize=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#print(np.mean(scores.history['episode_reward']))\n", + "#dqn.get_config()\n", + "#scores = dqn.test(env, nb_episodes=1, visualize=False, verbose=1)\n", + "#test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1)\n", + "#print(np.mean(scores.history['episode_reward']))callbacks = callbacks[:]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + 
},
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.6.8"
+   }
+  },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/DumbGame.py b/tests/DumbGame.py
new file mode 100644
index 0000000..828668d
--- /dev/null
+++ b/tests/DumbGame.py
@@ -0,0 +1,93 @@
+import random
+
+class Game:
+
+    def __init__(self):
+        print("New Game. Guess the correct order of the numbers 0-4 with a partner. 8 strikes and you're out.")
+        self.over = False
+        self.n_strikes = 8
+        self.numbers = list(range(0,5))
+        random.shuffle(self.numbers)
+        self.acting_player_index = 0
+        self.score = 0
+        self.state = self.GetState()
+
+    def GetActingPlayerIndex(self):
+        return self.acting_player_index
+
+    def GetNextPlayerIndex(self):
+        return (self.acting_player_index + 1) % 2
+
+    def GetState(self, player_idx=None):
+        s = []
+        s.append(self.n_strikes)
+        s.append(self.acting_player_index)
+        s.append(self.score)
+        # conceal solution from player
+        if player_idx is not None:
+            s.append("")
+        else:
+            s.append(self.numbers)
+        return s
+
+    def NumberIsPlayable(self, guessed_number):
+        return self.numbers[self.score] == guessed_number
+
+    def CheckGameOver(self):
+        if self.n_strikes == 0:
+            return True
+        if self.score == len(self.numbers):
+            return True
+        return False
+
+    def NextTurn(self):
+        start_state = self.GetState()
+        print(f"Player {self.acting_player_index + 1}'s turn to act.")
+        print("Here's what they know about the game:")
+        print(self.GetState(self.acting_player_index))
+        self.Action()
+        #self.GetActingPlayer().Act()
+        self.over = self.CheckGameOver()
+        self.acting_player_index = self.GetNextPlayerIndex()
+        end_state = self.GetState()
+        assert start_state != end_state
+        return end_state
+
+    def Action(self):
+        while True:
+            try:
+                guessed_number = int(input("Guess a number 0-4> "))
+                assert guessed_number in range(0,5)
+                break
+            except ValueError: # not an int
+                continue
+            except AssertionError: # not 0-4
+                continue
+        if self.NumberIsPlayable(guessed_number):
+            print("Correct!")
+            self.score += 1
+        else:
+            self.n_strikes -= 1
+            print(f"Wrong. {self.n_strikes} strikes remaining.")
+
+
+
+if __name__ == "__main__":
+
+    game = Game()
+    print(game.GetState())
+
+    while not game.over:
+        try:
+            new_state = game.NextTurn()
+            print(game.GetState())
+        except AssertionError:
+            print("Error: game state did not change when a turn was taken.")
+            raise SystemExit(1)
+
+    print("Game finished.")
+
+    if game.score == len(game.numbers):
+        print("Fireworks! You Win!")
+    else:
+        print("Too bad, you lose with a score of", game.score)
From b3e1b95413fb2389496ff19077bd900fc291bb67 Mon Sep 17 00:00:00 2001
From: Ben Messerly
Date: Fri, 26 Aug 2022 14:38:52 -0500
Subject: [PATCH 2/7] Bunch of RL sandboxes that I never really got working.
--- ...stonEnvironmentReinforcementLearning.ipynb | 513 +++++++++++++ RL_test_1/DumbGame.py | 62 ++ ...m Environment Reinforcement Learning.ipynb | 701 ++++++++++++++++++ RL_test_1/checkpoint | 2 + .../dqn_weights_box.h5f.data-00000-of-00001 | Bin 0 -> 3653 bytes RL_test_1/dqn_weights_box.h5f.index | Bin 0 -> 502 bytes ...n_weights_discrete.h5f.data-00000-of-00001 | Bin 0 -> 3653 bytes RL_test_1/dqn_weights_discrete.h5f.index | Bin 0 -> 502 bytes RL_test_2/Deep Reinforcement Learning.ipynb | 451 +++++++++++ RL_test_2/checkpoint | 2 + RL_test_2/dqn_weights.h5f.data-00000-of-00001 | Bin 0 -> 3850 bytes RL_test_2/dqn_weights.h5f.index | Bin 0 -> 504 bytes RL_test_3/Untitled.ipynb | 6 + 13 files changed, 1737 insertions(+) create mode 100644 RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb create mode 100644 RL_test_1/DumbGame.py create mode 100644 RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb create mode 100644 RL_test_1/checkpoint create mode 100644 RL_test_1/dqn_weights_box.h5f.data-00000-of-00001 create mode 100644 RL_test_1/dqn_weights_box.h5f.index create mode 100644 RL_test_1/dqn_weights_discrete.h5f.data-00000-of-00001 create mode 100644 RL_test_1/dqn_weights_discrete.h5f.index create mode 100644 RL_test_2/Deep Reinforcement Learning.ipynb create mode 100644 RL_test_2/checkpoint create mode 100644 RL_test_2/dqn_weights.h5f.data-00000-of-00001 create mode 100644 RL_test_2/dqn_weights.h5f.index create mode 100644 RL_test_3/Untitled.ipynb diff --git a/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb b/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb new file mode 100644 index 0000000..4cd38d1 --- /dev/null +++ b/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb @@ -0,0 +1,513 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 0. Install Dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. 
Test Random Environment with OpenAI Gym" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from gym import Env\n", + "from gym.spaces import Discrete, Box\n", + "import numpy as np\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class ShowerEnv(Env):\n", + " def __init__(self):\n", + " # Actions we can take, down, stay, up\n", + " self.action_space = Discrete(3)\n", + " # Temperature array\n", + " self.observation_space = Box(low=np.array([0]), high=np.array([100]))\n", + " # Set start temp\n", + " self.state = 38 + random.randint(-3,3)\n", + " # Set shower length\n", + " self.shower_length = 60\n", + " \n", + " def step(self, action):\n", + " # Apply action\n", + " # 0 -1 = -1 temperature\n", + " # 1 -1 = 0 \n", + " # 2 -1 = 1 temperature \n", + " self.state += action -1 \n", + " # Reduce shower length by 1 second\n", + " self.shower_length -= 1 \n", + " \n", + " # Calculate reward\n", + " if self.state >=37 and self.state <=39: \n", + " reward =1 \n", + " else: \n", + " reward = -1 \n", + " \n", + " # Check if shower is done\n", + " if self.shower_length <= 0: \n", + " done = True\n", + " else:\n", + " done = False\n", + " \n", + " # Apply temperature noise\n", + " #self.state += random.randint(-1,1)\n", + " # Set placeholder for info\n", + " info = {}\n", + " \n", + " # Return step information\n", + " return self.state, reward, done, info\n", + "\n", + " def render(self):\n", + " # Implement viz\n", + " pass\n", + " \n", + " def reset(self):\n", + " # Reset shower temperature\n", + " self.state = 38 + random.randint(-3,3)\n", + " # Reset shower time\n", + " self.shower_length = 60 \n", + " return self.state\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/gym/spaces/box.py:74: UserWarning: \u001b[33mWARN: Box bound precision lowered by casting to float32\u001b[0m\n", + " \"Box bound precision lowered by casting to {}\".format(self.dtype)\n" + ] + } + ], + "source": [ + "env = ShowerEnv()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([94.43672], dtype=float32)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.observation_space.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Episode:1 Score:-60\n", + "Episode:2 Score:12\n", + "Episode:3 Score:-46\n", + "Episode:4 Score:-38\n", + "Episode:5 Score:34\n", + "Episode:6 Score:-22\n", + "Episode:7 Score:-44\n", + "Episode:8 Score:-34\n", + "Episode:9 Score:-56\n", + "Episode:10 Score:-54\n" + ] + } + ], + "source": [ + "episodes = 10\n", + "for episode in range(1, episodes+1):\n", + " state = env.reset()\n", + " done = False\n", + " score = 0 \n", + " \n", + " while not done:\n", + " #env.render()\n", + " action = env.action_space.sample()\n", + " n_state, reward, done, info = env.step(action)\n", + " score+=reward\n", + " print('Episode:{} Score:{}'.format(episode, score))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. 
Create a Deep Learning Model with Keras" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Flatten\n", + "from tensorflow.keras.optimizers import Adam" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "states = env.observation_space.shape\n", + "actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(states, actions):\n", + " model = Sequential() \n", + " model.add(Dense(24, activation='relu', input_shape=states))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(actions, activation='linear'))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "del model " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model(states, actions)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "dense (Dense) (None, 24) 48 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (None, 24) 600 \n", + "_________________________________________________________________\n", + "dense_2 (Dense) (None, 3) 75 \n", + "=================================================================\n", + "Total params: 723\n", + "Trainable params: 723\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Build Agent with Keras-RL" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from rl.agents import DQNAgent\n", + "from rl.policy import BoltzmannQPolicy\n", + "from rl.memory import SequentialMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def build_agent(model, actions):\n", + " policy = BoltzmannQPolicy()\n", + " memory = SequentialMemory(limit=50000, window_length=1)\n", + " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", + " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", + " return dqn" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training for 50000 steps ...\n", + "Interval 1 (0 steps performed)\n", + "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + " 1/10000 [..............................] - ETA: 11:07 - reward: 1.0000" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", + " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5238\n", + "166 episodes - episode_reward: -31.410 [-60.000, 32.000] - loss: 0.820 - mae: 6.553 - mean_q: -6.437\n", + "\n", + "Interval 2 (10000 steps performed)\n", + "10000/10000 [==============================] - 72s 7ms/step - reward: -0.4258\n", + "167 episodes - episode_reward: -25.725 [-60.000, 44.000] - loss: 1.651 - mae: 9.015 - mean_q: -12.825\n", + "\n", + "Interval 3 (20000 steps performed)\n", + "10000/10000 [==============================] - 73s 7ms/step - reward: -0.4308\n", + "167 episodes - episode_reward: -25.713 [-60.000, 50.000] - loss: 1.489 - mae: 8.015 - mean_q: -11.281\n", + "\n", + "Interval 4 (30000 steps performed)\n", + "10000/10000 [==============================] - 77s 8ms/step - reward: 0.1662\n", + "166 episodes - episode_reward: 9.843 [-60.000, 56.000] - loss: 0.848 - mae: 4.911 - mean_q: -5.555\n", + "\n", + "Interval 5 (40000 steps performed)\n", + "10000/10000 [==============================] - 92s 9ms/step - reward: 0.7746\n", + "done, took 384.441 seconds\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", + "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 100 episodes ...\n", + "Episode 1: reward: 60.000, steps: 60\n", + "Episode 2: reward: 60.000, steps: 60\n", + 
"Episode 3: reward: 58.000, steps: 60\n", + "Episode 4: reward: 60.000, steps: 60\n", + "Episode 5: reward: 60.000, steps: 60\n", + "Episode 6: reward: 60.000, steps: 60\n", + "Episode 7: reward: 60.000, steps: 60\n", + "Episode 8: reward: 60.000, steps: 60\n", + "Episode 9: reward: 60.000, steps: 60\n", + "Episode 10: reward: 60.000, steps: 60\n", + "Episode 11: reward: 58.000, steps: 60\n", + "Episode 12: reward: 60.000, steps: 60\n", + "Episode 13: reward: 58.000, steps: 60\n", + "Episode 14: reward: 58.000, steps: 60\n", + "Episode 15: reward: 60.000, steps: 60\n", + "Episode 16: reward: 60.000, steps: 60\n", + "Episode 17: reward: 60.000, steps: 60\n", + "Episode 18: reward: 60.000, steps: 60\n", + "Episode 19: reward: 58.000, steps: 60\n", + "Episode 20: reward: 60.000, steps: 60\n", + "Episode 21: reward: 60.000, steps: 60\n", + "Episode 22: reward: 60.000, steps: 60\n", + "Episode 23: reward: 60.000, steps: 60\n", + "Episode 24: reward: 60.000, steps: 60\n", + "Episode 25: reward: 60.000, steps: 60\n", + "Episode 26: reward: 60.000, steps: 60\n", + "Episode 27: reward: 60.000, steps: 60\n", + "Episode 28: reward: 60.000, steps: 60\n", + "Episode 29: reward: 60.000, steps: 60\n", + "Episode 30: reward: 60.000, steps: 60\n", + "Episode 31: reward: 58.000, steps: 60\n", + "Episode 32: reward: 60.000, steps: 60\n", + "Episode 33: reward: 58.000, steps: 60\n", + "Episode 34: reward: 58.000, steps: 60\n", + "Episode 35: reward: 60.000, steps: 60\n", + "Episode 36: reward: 58.000, steps: 60\n", + "Episode 37: reward: 60.000, steps: 60\n", + "Episode 38: reward: 58.000, steps: 60\n", + "Episode 39: reward: 60.000, steps: 60\n", + "Episode 40: reward: 58.000, steps: 60\n", + "Episode 41: reward: 60.000, steps: 60\n", + "Episode 42: reward: 58.000, steps: 60\n", + "Episode 43: reward: 60.000, steps: 60\n", + "Episode 44: reward: 58.000, steps: 60\n", + "Episode 45: reward: 58.000, steps: 60\n", + "Episode 46: reward: 60.000, steps: 60\n", + "Episode 47: reward: 60.000, steps: 60\n", + "Episode 48: reward: 58.000, steps: 60\n", + "Episode 49: reward: 60.000, steps: 60\n", + "Episode 50: reward: 60.000, steps: 60\n", + "Episode 51: reward: 60.000, steps: 60\n", + "Episode 52: reward: 58.000, steps: 60\n", + "Episode 53: reward: 60.000, steps: 60\n", + "Episode 54: reward: 60.000, steps: 60\n", + "Episode 55: reward: 60.000, steps: 60\n", + "Episode 56: reward: 60.000, steps: 60\n", + "Episode 57: reward: 60.000, steps: 60\n", + "Episode 58: reward: 60.000, steps: 60\n", + "Episode 59: reward: 58.000, steps: 60\n", + "Episode 60: reward: 60.000, steps: 60\n", + "Episode 61: reward: 58.000, steps: 60\n", + "Episode 62: reward: 60.000, steps: 60\n", + "Episode 63: reward: 60.000, steps: 60\n", + "Episode 64: reward: 60.000, steps: 60\n", + "Episode 65: reward: 60.000, steps: 60\n", + "Episode 66: reward: 58.000, steps: 60\n", + "Episode 67: reward: 60.000, steps: 60\n", + "Episode 68: reward: 58.000, steps: 60\n", + "Episode 69: reward: 58.000, steps: 60\n", + "Episode 70: reward: 60.000, steps: 60\n", + "Episode 71: reward: 60.000, steps: 60\n", + "Episode 72: reward: 60.000, steps: 60\n", + "Episode 73: reward: 60.000, steps: 60\n", + "Episode 74: reward: 60.000, steps: 60\n", + "Episode 75: reward: 58.000, steps: 60\n", + "Episode 76: reward: 60.000, steps: 60\n", + "Episode 77: reward: 60.000, steps: 60\n", + "Episode 78: reward: 58.000, steps: 60\n", + "Episode 79: reward: 60.000, steps: 60\n", + "Episode 80: reward: 58.000, steps: 60\n", + "Episode 81: reward: 60.000, steps: 60\n", + 
"Episode 82: reward: 60.000, steps: 60\n", + "Episode 83: reward: 60.000, steps: 60\n", + "Episode 84: reward: 58.000, steps: 60\n", + "Episode 85: reward: 60.000, steps: 60\n", + "Episode 86: reward: 60.000, steps: 60\n", + "Episode 87: reward: 60.000, steps: 60\n", + "Episode 88: reward: 60.000, steps: 60\n", + "Episode 89: reward: 60.000, steps: 60\n", + "Episode 90: reward: 58.000, steps: 60\n", + "Episode 91: reward: 60.000, steps: 60\n", + "Episode 92: reward: 58.000, steps: 60\n", + "Episode 93: reward: 60.000, steps: 60\n", + "Episode 94: reward: 58.000, steps: 60\n", + "Episode 95: reward: 60.000, steps: 60\n", + "Episode 96: reward: 60.000, steps: 60\n", + "Episode 97: reward: 58.000, steps: 60\n", + "Episode 98: reward: 58.000, steps: 60\n", + "Episode 99: reward: 60.000, steps: 60\n", + "Episode 100: reward: 60.000, steps: 60\n", + "59.4\n" + ] + } + ], + "source": [ + "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", + "print(np.mean(scores.history['episode_reward']))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RL_test_1/DumbGame.py b/RL_test_1/DumbGame.py new file mode 100644 index 0000000..f6d8e27 --- /dev/null +++ b/RL_test_1/DumbGame.py @@ -0,0 +1,62 @@ +################################################################################ +# Dumb game to be played by machines +# Guess the correct order of the numbers 1-5 which are shuffled. +# Keep guessing until you get the whole sequence. +# Penalized -1 for every wrong guess. 
+################################################################################ + +from gym import Env +from gym.spaces import Discrete, Box +import random +import numpy as np + + +class DumbGameEnv(Env): + def __init__(self): + self.n_numbers = 5 + self.answer = list(range(self.n_numbers)) + random.shuffle(self.answer) + self.state = 0 + self.action_space = Discrete(5) + self.observation_space = Box(low=np.array([0],dtype=np.float32), high=np.array([2],dtype=np.float32)) + + def step(self, action): + reward = 0 + if action == self.answer[self.state]: + self.state += 1 + reward = 1 + else: + reward = -1 + + done = self.state == self.n_numbers or self.state < -50 + + info = {} + + # Return step information + return self.state, reward, done, info + + def render(self): + pass + + def reset(self): + random.shuffle(self.answer) + self.state = 0 + self.n_guesses = 0 + return self.state + +if __name__ == "__main__": + env = DumbGameEnv() + #print(env.observation_space.sample()) # 0-1 + #print(env.action_space.sample()) # 0-4 + episodes = 10 + for episode in range(1, episodes+1): + state = env.reset() + done = False + score = 0 + n_guesses = 0 + while not done: + n_guesses += 1 + action = env.action_space.sample() + n_state, reward, done, info = env.step(action) + score+=reward + print(f'Episode:{episode} Score:{score} NGuesses:{n_guesses}') diff --git a/RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb b/RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb new file mode 100644 index 0000000..031259c --- /dev/null +++ b/RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb @@ -0,0 +1,701 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 0. Install Dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. 
Test Random Environment with OpenAI Gym" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from gym import Env\n", + "from gym.spaces import Discrete, Box\n", + "import numpy as np\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "class ShowerEnv(Env):\n", + " def __init__(self):\n", + " # Actions we can take, down, stay, up\n", + " self.action_space = Discrete(5)\n", + " # Temperature array\n", + " self.observation_space = Discrete(100)\n", + " # Set start temp\n", + " self.state = 38 + random.randint(-3,3)\n", + " # Set shower length\n", + " self.shower_length = 60\n", + " \n", + " self.answer = list(range(3))\n", + " random.shuffle(self.answer)\n", + " \n", + " def step(self, action):\n", + " # Apply action\n", + " # 0 -1 = -1 temperature\n", + " # 1 -1 = 0 \n", + " # 2 -1 = 1 temperature \n", + " self.state += action -2 \n", + " # Reduce shower length by 1 second\n", + " self.shower_length -= 1 \n", + " \n", + " # Calculate reward\n", + " if self.state >=37 and self.state <=39: \n", + " reward =1 \n", + " else: \n", + " reward = -1 \n", + " \n", + " # Check if shower is done\n", + " if self.shower_length <= 0: \n", + " done = True\n", + " else:\n", + " done = False\n", + " \n", + " # Apply temperature noise\n", + " #self.state += random.randint(-1,1)\n", + " # Set placeholder for info\n", + " info = {}\n", + " \n", + " # Return step information\n", + " return self.state, reward, done, info\n", + "\n", + " def render(self):\n", + " # Implement viz\n", + " pass\n", + " \n", + " def reset(self):\n", + " # Reset shower temperature\n", + " self.state = 38 + random.randint(-3,3)\n", + " # Reset shower time\n", + " self.shower_length = 60\n", + " random.shuffle(self.answer)\n", + " return self.state" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "env = ShowerEnv()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.observation_space.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "assert not env.observation_space.contains(1.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Episode:1 Score:10\n", + "Episode:2 Score:-56\n", + "Episode:3 Score:-34\n", + "Episode:4 Score:-58\n", + "Episode:5 Score:-60\n", + "Episode:6 Score:-50\n", + "Episode:7 Score:-6\n", + "Episode:8 Score:-60\n", + "Episode:9 Score:4\n", + "Episode:10 Score:44\n" + ] + } + ], + "source": [ + "episodes = 10\n", + "for episode in range(1, episodes+1):\n", + " state = env.reset()\n", + " done = False\n", + " score = 0 \n", + " \n", + " while not done:\n", + " #env.render()\n", + " action = env.action_space.sample()\n", + " n_state, reward, done, info = env.step(action)\n", + " score+=reward\n", + " print('Episode:{} Score:{}'.format(episode, score))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. 
Create a Deep Learning Model with Keras" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Flatten\n", + "from tensorflow.keras.optimizers import Adam" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#states = env.observation_space.shape\n", + "states = (1,)\n", + "actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1,)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "states" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(states, actions):\n", + " model = Sequential() \n", + " model.add(Dense(24, activation='relu', input_shape=states))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(actions, activation='linear'))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "del model" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model(states, actions)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "dense_6 (Dense) (None, 24) 48 \n", + "_________________________________________________________________\n", + "dense_7 (Dense) (None, 24) 600 \n", + "_________________________________________________________________\n", + "dense_8 (Dense) (None, 3) 75 \n", + "=================================================================\n", + "Total params: 723\n", + "Trainable params: 723\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Build Agent with Keras-RL" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from rl.agents import DQNAgent\n", + "from rl.policy import BoltzmannQPolicy\n", + "from rl.memory import SequentialMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def build_agent(model, actions):\n", + " policy = BoltzmannQPolicy()\n", + " memory = SequentialMemory(limit=50000, window_length=1)\n", + " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", + " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", + " return dqn" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training for 50000 steps ...\n", + "Interval 1 (0 steps performed)\n", + "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + " 1/10000 [..............................] - ETA: 11:37 - reward: 1.0000" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", + " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000/10000 [==============================] - 71s 7ms/step - reward: -0.4972\n", + "166 episodes - episode_reward: -30.193 [-60.000, 60.000] - loss: 0.802 - mae: 7.679 - mean_q: -9.525\n", + "\n", + "Interval 2 (10000 steps performed)\n", + "10000/10000 [==============================] - 72s 7ms/step - reward: -0.0406\n", + "167 episodes - episode_reward: -2.072 [-60.000, 60.000] - loss: 1.381 - mae: 8.380 - mean_q: -5.159\n", + "\n", + "Interval 3 (20000 steps performed)\n", + "10000/10000 [==============================] - 74s 7ms/step - reward: 0.1818\n", + "167 episodes - episode_reward: 10.766 [-60.000, 60.000] - loss: 2.197 - mae: 8.740 - mean_q: 1.123\n", + "\n", + "Interval 4 (30000 steps performed)\n", + "10000/10000 [==============================] - 79s 8ms/step - reward: 0.3792\n", + "166 episodes - episode_reward: 22.602 [-60.000, 60.000] - loss: 6.342 - mae: 10.959 - mean_q: 12.510\n", + "\n", + "Interval 5 (40000 steps performed)\n", + "10000/10000 [==============================] - 83s 8ms/step - reward: 0.3358\n", + "done, took 378.766 seconds\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", + "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 100 episodes ...\n", + "Episode 1: reward: -60.000, steps: 60\n", + "Episode 2: reward: 
60.000, steps: 60\n", + "Episode 3: reward: 60.000, steps: 60\n", + "Episode 4: reward: -60.000, steps: 60\n", + "Episode 5: reward: 60.000, steps: 60\n", + "Episode 6: reward: -60.000, steps: 60\n", + "Episode 7: reward: -60.000, steps: 60\n", + "Episode 8: reward: -60.000, steps: 60\n", + "Episode 9: reward: -60.000, steps: 60\n", + "Episode 10: reward: -60.000, steps: 60\n", + "Episode 11: reward: 60.000, steps: 60\n", + "Episode 12: reward: 60.000, steps: 60\n", + "Episode 13: reward: 60.000, steps: 60\n", + "Episode 14: reward: -60.000, steps: 60\n", + "Episode 15: reward: 60.000, steps: 60\n", + "Episode 16: reward: 60.000, steps: 60\n", + "Episode 17: reward: 60.000, steps: 60\n", + "Episode 18: reward: -60.000, steps: 60\n", + "Episode 19: reward: -60.000, steps: 60\n", + "Episode 20: reward: 60.000, steps: 60\n", + "Episode 21: reward: -60.000, steps: 60\n", + "Episode 22: reward: -60.000, steps: 60\n", + "Episode 23: reward: -60.000, steps: 60\n", + "Episode 24: reward: 60.000, steps: 60\n", + "Episode 25: reward: -60.000, steps: 60\n", + "Episode 26: reward: -60.000, steps: 60\n", + "Episode 27: reward: 60.000, steps: 60\n", + "Episode 28: reward: 60.000, steps: 60\n", + "Episode 29: reward: 60.000, steps: 60\n", + "Episode 30: reward: -60.000, steps: 60\n", + "Episode 31: reward: -60.000, steps: 60\n", + "Episode 32: reward: 60.000, steps: 60\n", + "Episode 33: reward: 60.000, steps: 60\n", + "Episode 34: reward: 60.000, steps: 60\n", + "Episode 35: reward: 60.000, steps: 60\n", + "Episode 36: reward: 60.000, steps: 60\n", + "Episode 37: reward: -60.000, steps: 60\n", + "Episode 38: reward: -60.000, steps: 60\n", + "Episode 39: reward: -60.000, steps: 60\n", + "Episode 40: reward: 60.000, steps: 60\n", + "Episode 41: reward: -60.000, steps: 60\n", + "Episode 42: reward: 60.000, steps: 60\n", + "Episode 43: reward: -60.000, steps: 60\n", + "Episode 44: reward: -60.000, steps: 60\n", + "Episode 45: reward: -60.000, steps: 60\n", + "Episode 46: reward: 60.000, steps: 60\n", + "Episode 47: reward: 60.000, steps: 60\n", + "Episode 48: reward: -60.000, steps: 60\n", + "Episode 49: reward: 60.000, steps: 60\n", + "Episode 50: reward: 60.000, steps: 60\n", + "Episode 51: reward: -60.000, steps: 60\n", + "Episode 52: reward: 60.000, steps: 60\n", + "Episode 53: reward: -60.000, steps: 60\n", + "Episode 54: reward: 60.000, steps: 60\n", + "Episode 55: reward: 60.000, steps: 60\n", + "Episode 56: reward: -60.000, steps: 60\n", + "Episode 57: reward: 60.000, steps: 60\n", + "Episode 58: reward: -60.000, steps: 60\n", + "Episode 59: reward: -60.000, steps: 60\n", + "Episode 60: reward: 60.000, steps: 60\n", + "Episode 61: reward: -60.000, steps: 60\n", + "Episode 62: reward: -60.000, steps: 60\n", + "Episode 63: reward: 60.000, steps: 60\n", + "Episode 64: reward: -60.000, steps: 60\n", + "Episode 65: reward: 60.000, steps: 60\n", + "Episode 66: reward: -60.000, steps: 60\n", + "Episode 67: reward: 60.000, steps: 60\n", + "Episode 68: reward: -60.000, steps: 60\n", + "Episode 69: reward: -60.000, steps: 60\n", + "Episode 70: reward: -60.000, steps: 60\n", + "Episode 71: reward: -60.000, steps: 60\n", + "Episode 72: reward: -60.000, steps: 60\n", + "Episode 73: reward: 60.000, steps: 60\n", + "Episode 74: reward: -60.000, steps: 60\n", + "Episode 75: reward: 60.000, steps: 60\n", + "Episode 76: reward: 60.000, steps: 60\n", + "Episode 77: reward: -60.000, steps: 60\n", + "Episode 78: reward: -60.000, steps: 60\n", + "Episode 79: reward: 60.000, steps: 60\n", + "Episode 80: reward: -60.000, 
steps: 60\n", + "Episode 81: reward: 60.000, steps: 60\n", + "Episode 82: reward: 60.000, steps: 60\n", + "Episode 83: reward: -60.000, steps: 60\n", + "Episode 84: reward: 60.000, steps: 60\n", + "Episode 85: reward: -60.000, steps: 60\n", + "Episode 86: reward: 60.000, steps: 60\n", + "Episode 87: reward: -60.000, steps: 60\n", + "Episode 88: reward: 60.000, steps: 60\n", + "Episode 89: reward: 60.000, steps: 60\n", + "Episode 90: reward: -60.000, steps: 60\n", + "Episode 91: reward: -60.000, steps: 60\n", + "Episode 92: reward: 60.000, steps: 60\n", + "Episode 93: reward: -60.000, steps: 60\n", + "Episode 94: reward: -60.000, steps: 60\n", + "Episode 95: reward: -60.000, steps: 60\n", + "Episode 96: reward: 60.000, steps: 60\n", + "Episode 97: reward: 60.000, steps: 60\n", + "Episode 98: reward: -60.000, steps: 60\n", + "Episode 99: reward: -60.000, steps: 60\n", + "Episode 100: reward: 60.000, steps: 60\n", + "-3.6\n" + ] + } + ], + "source": [ + "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", + "print(np.mean(scores.history['episode_reward']))" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.save_weights('dqn_weights_discrete.h5f', overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "del model\n", + "del dqn\n", + "del env" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "env = ShowerEnv()\n", + "actions = env.action_space.n\n", + "states = (1,)\n", + "model = build_model(states, actions)\n", + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.load_weights('dqn_weights_discrete.h5f')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 100 episodes ...\n", + "Episode 1: reward: 58.000, steps: 60\n", + "Episode 2: reward: 58.000, steps: 60\n", + "Episode 3: reward: 58.000, steps: 60\n", + "Episode 4: reward: 60.000, steps: 60\n", + "Episode 5: reward: 60.000, steps: 60\n", + "Episode 6: reward: 58.000, steps: 60\n", + "Episode 7: reward: 58.000, steps: 60\n", + "Episode 8: reward: 60.000, steps: 60\n", + "Episode 9: reward: 60.000, steps: 60\n", + "Episode 10: reward: 58.000, steps: 60\n", + "Episode 11: reward: 58.000, steps: 60\n", + "Episode 12: reward: 60.000, steps: 60\n", + "Episode 13: reward: 60.000, steps: 60\n", + "Episode 14: reward: 60.000, steps: 60\n", + "Episode 15: reward: 58.000, steps: 60\n", + "Episode 16: reward: 60.000, steps: 60\n", + "Episode 17: reward: 60.000, steps: 60\n", + "Episode 18: reward: 60.000, steps: 60\n", + "Episode 19: reward: 60.000, steps: 60\n", + "Episode 20: reward: 60.000, steps: 60\n", + "Episode 21: reward: 60.000, steps: 60\n", + "Episode 22: reward: 60.000, steps: 60\n", + "Episode 23: reward: 60.000, steps: 60\n", + "Episode 24: reward: 60.000, steps: 60\n", + "Episode 25: reward: 60.000, steps: 60\n", + "Episode 26: reward: 60.000, steps: 60\n", + "Episode 27: reward: 60.000, steps: 60\n", + "Episode 28: reward: 60.000, steps: 60\n", + "Episode 29: reward: 60.000, steps: 60\n", + "Episode 30: reward: 58.000, steps: 60\n", + "Episode 31: reward: 60.000, steps: 60\n", + "Episode 32: reward: 58.000, steps: 60\n", + "Episode 33: reward: 60.000, steps: 
60\n", + "Episode 34: reward: 60.000, steps: 60\n", + "Episode 35: reward: 58.000, steps: 60\n", + "Episode 36: reward: 60.000, steps: 60\n", + "Episode 37: reward: 60.000, steps: 60\n", + "Episode 38: reward: 60.000, steps: 60\n", + "Episode 39: reward: 60.000, steps: 60\n", + "Episode 40: reward: 60.000, steps: 60\n", + "Episode 41: reward: 60.000, steps: 60\n", + "Episode 42: reward: 60.000, steps: 60\n", + "Episode 43: reward: 60.000, steps: 60\n", + "Episode 44: reward: 60.000, steps: 60\n", + "Episode 45: reward: 58.000, steps: 60\n", + "Episode 46: reward: 58.000, steps: 60\n", + "Episode 47: reward: 60.000, steps: 60\n", + "Episode 48: reward: 60.000, steps: 60\n", + "Episode 49: reward: 60.000, steps: 60\n", + "Episode 50: reward: 60.000, steps: 60\n", + "Episode 51: reward: 60.000, steps: 60\n", + "Episode 52: reward: 60.000, steps: 60\n", + "Episode 53: reward: 60.000, steps: 60\n", + "Episode 54: reward: 60.000, steps: 60\n", + "Episode 55: reward: 60.000, steps: 60\n", + "Episode 56: reward: 60.000, steps: 60\n", + "Episode 57: reward: 60.000, steps: 60\n", + "Episode 58: reward: 60.000, steps: 60\n", + "Episode 59: reward: 58.000, steps: 60\n", + "Episode 60: reward: 58.000, steps: 60\n", + "Episode 61: reward: 60.000, steps: 60\n", + "Episode 62: reward: 60.000, steps: 60\n", + "Episode 63: reward: 60.000, steps: 60\n", + "Episode 64: reward: 58.000, steps: 60\n", + "Episode 65: reward: 60.000, steps: 60\n", + "Episode 66: reward: 58.000, steps: 60\n", + "Episode 67: reward: 60.000, steps: 60\n", + "Episode 68: reward: 58.000, steps: 60\n", + "Episode 69: reward: 58.000, steps: 60\n", + "Episode 70: reward: 60.000, steps: 60\n", + "Episode 71: reward: 60.000, steps: 60\n", + "Episode 72: reward: 60.000, steps: 60\n", + "Episode 73: reward: 60.000, steps: 60\n", + "Episode 74: reward: 58.000, steps: 60\n", + "Episode 75: reward: 58.000, steps: 60\n", + "Episode 76: reward: 58.000, steps: 60\n", + "Episode 77: reward: 60.000, steps: 60\n", + "Episode 78: reward: 60.000, steps: 60\n", + "Episode 79: reward: 60.000, steps: 60\n", + "Episode 80: reward: 58.000, steps: 60\n", + "Episode 81: reward: 58.000, steps: 60\n", + "Episode 82: reward: 60.000, steps: 60\n", + "Episode 83: reward: 58.000, steps: 60\n", + "Episode 84: reward: 60.000, steps: 60\n", + "Episode 85: reward: 58.000, steps: 60\n", + "Episode 86: reward: 60.000, steps: 60\n", + "Episode 87: reward: 60.000, steps: 60\n", + "Episode 88: reward: 60.000, steps: 60\n", + "Episode 89: reward: 60.000, steps: 60\n", + "Episode 90: reward: 60.000, steps: 60\n", + "Episode 91: reward: 58.000, steps: 60\n", + "Episode 92: reward: 60.000, steps: 60\n", + "Episode 93: reward: 60.000, steps: 60\n", + "Episode 94: reward: 60.000, steps: 60\n", + "Episode 95: reward: 60.000, steps: 60\n", + "Episode 96: reward: 60.000, steps: 60\n", + "Episode 97: reward: 58.000, steps: 60\n", + "Episode 98: reward: 58.000, steps: 60\n", + "Episode 99: reward: 60.000, steps: 60\n", + "Episode 100: reward: 60.000, steps: 60\n", + "59.42\n" + ] + } + ], + "source": [ + "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", + "print(np.mean(scores.history['episode_reward']))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RL_test_1/checkpoint b/RL_test_1/checkpoint new file mode 100644 index 0000000..0914d39 --- /dev/null +++ b/RL_test_1/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "dqn_weights_discrete.h5f" +all_model_checkpoint_paths: "dqn_weights_discrete.h5f" diff --git a/RL_test_1/dqn_weights_box.h5f.data-00000-of-00001 b/RL_test_1/dqn_weights_box.h5f.data-00000-of-00001 new file mode 100644 index 0000000000000000000000000000000000000000..1a8684740877791bb6ece909eb35c523351b4b53 GIT binary patch literal 3653 zcmbtWdsNNo8m@G+<(4R+yC$R?HtF}hdl#i#(v>1zWFtzcElHB@rIa>y<P?OT6)24ykLd!d#up@Fp~j8vbKH9?HK@ZSAcn3G;&Ofhe+VpFnTCHX(Y0 zmofh5t0Zv4JuJS>MKWU&M9sKVYw?7E2dy<|e)XZ4WZmV24Bp~oB@S|=hRRn({Gs!G z--?SvRk_31$4afa!b$jXJa>4$>f850chyyDp zQK@G_YAY0t4-6Ae3^Prh=NgR!YXR7y*yS8B>o_jWOk zd}f8gdl@P@5Fm2PEg-Qw^Kk4 z-$`<%`v`RDXF{^F7`Yb@qw%h9F>6IWq;4z3zy>RnnzfzCnNJ7bfDOpR?8oxahu{dY zM5EOBH#dgrJk3P783g{ZuSlq%hdQMkMV{Fim>stZ z{dgJRyWl8v9=nB4N7}=g=usT^yDPwd$3ytNL=908joqB_O!uZx<}3Y080$Mgw=T>8 z4~t)j_gH6~ZX5@uJJj*6HV^Gzhl6322BxvI!9!LPuf#;4P~HSX-^61ia|R8BJd_kA zp^A4H%9x+Pah27yW>F9*%;A&Di4s-J(;okK>A%yG(Ss7yPrcfM}`LP-EHo5X85GMw=Yy?CNFOBNV_yp_(R0q;aJnBim`q2&`XkVo02;AamT=5y#|h3QzLRIv&LyFSa_;Z zfOV{P+)&yMFa739v zM++M_hX;|SfoNn8SVMfbK7f8KNu}YT$+yV zJ^_NK)zGwe1#z5}4Qq6lfx*ml)JoYXGVF`PCs+Bj!3iiC&h#hXb@M=;n97)r{ zZp$C3b9O5DSSw>#M;CE$+)qw=m%!3xHei)@5GgIioTMa>EVf`Y)gn=_XbN@=?V*P5 zwc2p3!m0`Zato4bEt6!SlH|e$Ia)yig+E{ld1IAdm!RW)DsGOdGLMtKM=FjFtz(Tyd*c80l;z3i;j?E#X zIH@V?IPu0_V^&{)M}j#x=_h>*jf}(El;hCfBZn)hN+G~C6JX&wIIgFKwq-u# z;IbSnJzEL`&-^j{csQ+}X9LMCgS_W7(7Kb0(h7xO`@9-2$FG29 zvj;S$S%^YG75(C$WKnCW*;hpk*#a zGyQwX^XFS}YF{7aY?9ok2-dX`Ty?RLQ4aAyKq4 zEfEzru`xpc8{tqJNnF$*0w zr{F2$92iiX1?t<^gH%O!jlugO>MQXcMu^8YcfZVt( zjHk6bjDMVf5mNx|*ZxZ1?*W`@C%_oJ42VP*+!lQXEuRVCg@!~F=GjCaY}kfxw6)0* zX(i-d@*sw~&BUrsgT|)qfUY%Vz+P$%SDzH2;8-|GH@HZaI2F$tI#SO!k!2a)(f%lUL#37RM(Y$As zSW(nOWzzIOu~V8Bo@=3P8@Ev@$5h6)SB7RgDuI@IsOa6oJWwo3q7{?xGw-D~fO~)d zGGlI#rBh|-Msp2}AY3y1F8Z`a)rZj8(RW0D!BWrA;ki(r{`saFp0D#gI8(>gaa#K1 z#p|RKAZX0DANV!DoyW-g)!4Hx3*23jAlKw2Ga|zSnx3Anox60VXmwh<=)0CK>|=d{ z2d=Egk}XfEdI$lDQ#Yz9_@QxS7`_m+i*o(BSn}0PI$%_bfrUNT+wlrrjS8^gY%Me% z$%g9*A{b>K4f7pKL9wNSPQ7#qXZ{on3v@Jb+R-SGSdYWU=9$1@R}h1;ew?S*0HXA} zcr@n)+>YIk+HNPsGy9svx(4!+PEB8Oujx7rymo>9xNaE8GkADoG+y|{XbLBG5*`srXQ&PHTB#trT(}PUo!vFzn=T| Ksm=ad>VE+XRzJW1 literal 0 HcmV?d00001 diff --git a/RL_test_1/dqn_weights_box.h5f.index b/RL_test_1/dqn_weights_box.h5f.index new file mode 100644 index 0000000000000000000000000000000000000000..518e851d2cfd21577399b75014ab16ade83874a6 GIT binary patch literal 502 zcmZQzVB=tvV&Y(Akl~Ma_HcFf4)FK%3vqPvagFzP@^Wf{qL3lc zb{!A~>WXqruMkxfC)P3vuqzKRYBUI$9y-UX3bfFONDG-24v1;+m^R0FZz9n`W`zYJ z8Vi_AW34ko7=VBgMAW+^ZiRCWd|J4Hfsr|vVF5E-j7w9@eJL}?CT4z+5PlH;yP;d9 H)crO94%KvF literal 0 HcmV?d00001 diff --git a/RL_test_1/dqn_weights_discrete.h5f.data-00000-of-00001 b/RL_test_1/dqn_weights_discrete.h5f.data-00000-of-00001 new file mode 100644 index 0000000000000000000000000000000000000000..46879231ccabdbecc54b02059f005964369d53f3 GIT binary patch literal 3653 zcmbuBd00*R8pn5YcB3sx4y8eZXh2H)_wGcIgHjqzyNIHal<0IjNg<^|VKI=8$dq)kNZm2N@Hh39TO@ahNj~ zbRIjQOj{?0IBCOAZbu+zsUs8lsDyayJw?_~J)J0F{dxUbLNIH zp&D|Ej0pCPg=gE@5VFS@q0bZjSEds23KNLEM6h3BJ-p~Z zB)=MWroSe^Dvn^+8U(e5xoEPx9AmN685w# zrIluTz|TDnJU{@2aT#cM*Avey)5p$04d&_@V^BPA2CTPPCyzg02!m%-VY8wEa=m52 zT9jk8s3QeLa(Z0Xsii2ID#yeMHSpBFKd$>`syq4A5h!SuO@q34nuWsb|k%>CQI zs-Ya6UMfJ|z*?w(@H=TxJWdm`w_#_F8M$FR4YoWTk1c^-SbeqX^gyOHDyxEn?AQX^_ksj!S>6 
zLce_p#MW{bY*vZ~*|aXw)*cQ$GZsLcUIJ(@76${9DR^*kDCx4d!K6*bFmlrz9IaIZ z-XY;ow0SAHIj@jNuYXSCvm@}}ygb60IvFDN>?Dp|B4YNdBt|%F#uSB-Fz$f~TKBK! z?s9X47*|C!%5EaFKNb_G_0yokbs>0Vl%mn=Xp$T*5BAD&s43}67B&QcFSF^1-0X4wQd9K#k64 zl15=98itLBG?^2)Bv}QHu9IOnk|y{fVlRx7od5sgW*SLpp!VER5#O0EhB+?g{oS zs&4*(7bV_7SiGtk;xdW2r(Vliv-}p#dQ$)&G}e=FBX9C(lmO$l{zkz=o0s=)9K7>8 z4bS8agiiuOu{PQQR^+eab-s6DMOa zYNB~Mh*&$WGMf~a!oHwb>^#E8kL{EupLm96n$PeA9Bm96cAXylWwUVn=pxK-eMm2E z^WupWDdFzi&2YcW3698@0?0ewGXzQ zNQ7$^a_IB;I8mQvAdJ2>6HaU7VVWQlC3MFK4__RCF@lrmF>MA~1YIGU%PT3VKLztz z4$&TkHriyeh4%S6z;eX}u&p_P>e%Fy+3W#wu0xIb8F`UUHe1m-`!==PyB*GXMN`(u zMc80=0DU_eiLfmbI-4SKdHyF_W^D(G+hrlBHI>RbdXd`JASzHPfs=esG~SqrVVyfs zlQj~}%WN>xn+JULXUw*7g`kt62%`tmAaAufxbJqSJ;H9H>TQadOQ+)7W$pBs*c7dj$sC^h2cd25>my4kO;j z_dUOBtj0}r4?7Wk3*+h9;s}s!n?;kK>}Hhbzvo)7 zsj|9IqlFnE_2jjOCVtSfzlrlGdkUN8(^1%viuL2Z&W z7QWj>4@GC=z1h*=nQ4djM@`3BPfdBN98Qs((`&KsVlR%rl8L>FC&{`w`n(Xq8^RH* zrDwy$Fyd_njZe*kU4{E0UtB}T72k|O&QkQKcqD$THph7i*)&H=hNSpwa1(3+kr1Knh5*m7F_0eh29Lj zPc|CF!a559zFNH#BW~})X)*4gY46MpiL64$r3J(~tda>Xu!oBdxzMtZ4TI%}QF%co znX;w;HT;i)KK~`DTw9E+isPrUt|z0ogFT4kOPT&##t`9FgssXt=$f<$Jd~4pjIS!VV{AxG_8iStgw`96L3I8?5ZmBshgJ-xUlx z_l}T7&n>X?c^#vb^8>8b&Bw(nJBg;&F(SCM3E!{|!O7W1Xft6g_9o{+<&L9359wg# zKqYh)+F<$htst_BLP6E+p?{bysSHYiQ+ppVoR(M`TBanl5@~;#m6~h%!TpDqLX7QX z)?7ZoJAL~Xo=?SFY8k(kY0^Fa#q+@KHs;LUN|H3DvF%K(YeePD7Y1^%(D40m~r1c$K|&~w5G znWh~yOIaTZT>43SryNhmC`PoW%v?l1CJAMqkF9%rrW};7&qcw|xq22l@qVsg2Te_@pQ;(VOrBt;s_OfHd*lMz392I}BEk403!u&#fSA_Zat?>=n zILU-9H`I~&(iyr@lNJA_&2-2n{?%qWWRv)&&1}df@zrL=R*;qC3|k)*;uqi-?i0*b zWwT|aIMVBUf&+d0*|M_I97%6KuZ>@HnLp{Wf6|BjNoW6|v)D^G9&AO|nNIdIZRXpq zaGg2d#g?PA$|rE6&k9ou<9}_SZvL&u{iA6-dFJB9PWCn~i*22a|1*f~#&P+taD{(| zeX9Pp8lj*I@lH#~@Fz zVPTFwp{^W^LM%c|91;o%8VRN<>32?xDhg+(7UiYp;Mc?f)WitV2-KFU@0cj8C}c>q zT?d4Ly4-qnv_uufiM31u?8*a-8Vy3GQnytVffgDOX(6-10Wl38(|sW+=SZ}WSz&>Q z#sVhO-Zo=1.1.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.1.2)\n", + "Requirement already satisfied: google-pasta>=0.1.8 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (0.2.0)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (3.3.0)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.1.0)\n", + "Requirement already satisfied: wrapt>=1.11.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.13.3)\n", + "Requirement already satisfied: grpcio>=1.8.6 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.43.0)\n", + "Requirement already satisfied: numpy<1.19.0,>=1.16.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.18.5)\n", + "Requirement already satisfied: scipy==1.4.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.4.1)\n", + "Requirement already satisfied: six>=1.12.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.16.0)\n", + "Requirement already satisfied: astunparse==1.6.3 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.6.3)\n", + "Requirement already satisfied: gast==0.3.3 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (0.3.3)\n", + "Requirement already satisfied: wheel>=0.26 in 
/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (0.37.1)\n", + "Requirement already satisfied: tensorboard<3,>=2.3.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (2.8.0)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (3.19.4)\n", + "Requirement already satisfied: absl-py>=0.7.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.0.0)\n", + "Requirement already satisfied: tensorflow-estimator<2.4.0,>=2.3.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (2.3.0)\n", + "Requirement already satisfied: h5py<2.11.0,>=2.10.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (2.10.0)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (0.6.1)\n", + "Requirement already satisfied: markdown>=2.6.8 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (3.3.6)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (2.0.2)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (2.27.1)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (58.0.4)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (0.4.6)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (2.6.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow==2.3.0) (1.8.1)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (0.2.8)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (5.0.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (4.8)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (1.3.1)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (4.10.1)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from 
importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (4.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (3.7.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (0.4.8)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (2.0.11)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (3.3)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (1.26.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (2021.10.8)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<3,>=2.3.0->tensorflow==2.3.0) (3.2.0)\n", + "Requirement already satisfied: gym in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (0.21.0)\n", + "Requirement already satisfied: cloudpickle>=1.2.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from gym) (2.0.0)\n", + "Requirement already satisfied: numpy>=1.18.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from gym) (1.18.5)\n", + "Requirement already satisfied: importlib-metadata>=4.8.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from gym) (4.10.1)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from importlib-metadata>=4.8.1->gym) (4.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from importlib-metadata>=4.8.1->gym) (3.7.0)\n", + "Requirement already satisfied: keras in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (2.8.0)\n", + "Requirement already satisfied: keras-rl2 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (1.0.5)\n", + "Requirement already satisfied: tensorflow in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from keras-rl2) (2.3.0)\n", + "Requirement already satisfied: six>=1.12.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.16.0)\n", + "Requirement already satisfied: h5py<2.11.0,>=2.10.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (2.10.0)\n", + "Requirement already satisfied: wheel>=0.26 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (0.37.1)\n", + "Requirement already satisfied: astunparse==1.6.3 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.6.3)\n", + "Requirement already satisfied: google-pasta>=0.1.8 in 
/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (0.2.0)\n", + "Requirement already satisfied: keras-preprocessing<1.2,>=1.1.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.1.2)\n", + "Requirement already satisfied: absl-py>=0.7.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.0.0)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (3.19.4)\n", + "Requirement already satisfied: scipy==1.4.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.4.1)\n", + "Requirement already satisfied: tensorflow-estimator<2.4.0,>=2.3.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (2.3.0)\n", + "Requirement already satisfied: gast==0.3.3 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (0.3.3)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (3.3.0)\n", + "Requirement already satisfied: numpy<1.19.0,>=1.16.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.18.5)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: termcolor>=1.1.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.1.0)\n", + "Requirement already satisfied: wrapt>=1.11.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.13.3)\n", + "Requirement already satisfied: grpcio>=1.8.6 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (1.43.0)\n", + "Requirement already satisfied: tensorboard<3,>=2.3.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow->keras-rl2) (2.8.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (3.3.6)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (0.4.6)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (58.0.4)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (1.8.1)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (0.6.1)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2.0.2)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2.27.1)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in 
/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2.6.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (0.2.8)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (5.0.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (4.8)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (1.3.1)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (4.10.1)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (3.7.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (4.0.1)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (0.4.8)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (1.26.8)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2.0.11)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2021.10.8)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (3.3)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (3.2.0)\n" + ] + } + ], + "source": [ + "!pip install tensorflow==2.3.0\n", + "!pip install gym\n", + "!pip install keras\n", + "!pip install keras-rl2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. 
Test Random Environment with OpenAI Gym" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import gym \n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make('CartPole-v0')\n", + "states = env.observation_space.shape[0]\n", + "actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Episode:1 Score:19.0\n", + "Episode:2 Score:36.0\n", + "Episode:3 Score:15.0\n", + "Episode:4 Score:23.0\n", + "Episode:5 Score:15.0\n", + "Episode:6 Score:12.0\n", + "Episode:7 Score:40.0\n", + "Episode:8 Score:23.0\n", + "Episode:9 Score:18.0\n", + "Episode:10 Score:12.0\n" + ] + } + ], + "source": [ + "episodes = 10\n", + "for episode in range(1, episodes+1):\n", + " state = env.reset()\n", + " done = False\n", + " score = 0 \n", + " \n", + " while not done:\n", + " env.render()\n", + " action = random.choice([0,1])\n", + " n_state, reward, done, info = env.step(action)\n", + " score+=reward\n", + " print('Episode:{} Score:{}'.format(episode, score))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Create a Deep Learning Model with Keras" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Flatten\n", + "from tensorflow.keras.optimizers import Adam" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(states, actions):\n", + " model = Sequential()\n", + " model.add(Flatten(input_shape=(1,states)))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(actions, activation='linear'))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "del model\n", + "\n", + "model = build_model(states, actions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Build Agent with Keras-RL" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from rl.agents import DQNAgent\n", + "from rl.policy import BoltzmannQPolicy\n", + "from rl.memory import SequentialMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def build_agent(model, actions):\n", + " policy = BoltzmannQPolicy()\n", + " memory = SequentialMemory(limit=50000, window_length=1)\n", + " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", + " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", + " return dqn" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training for 10000 steps ...\n", + "Interval 1 (0 steps performed)\n", + "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + " 1/10000 [..............................] - ETA: 11:14 - reward: 1.0000" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", + " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000/10000 [==============================] - 76s 8ms/step - reward: 1.0000\n", + "done, took 76.468 seconds\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", + "dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scores = dqn.test(env, nb_episodes=5, visualize=False)\n", + "print(np.mean(scores.history['episode_reward']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_ = dqn.test(env, nb_episodes=15, visualize=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. 
Reloading Agent from Memory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.save_weights('dqn_weights.h5f', overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del model\n", + "del dqn\n", + "del env" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make('CartPole-v0')\n", + "actions = env.action_space.n\n", + "states = env.observation_space.shape[0]\n", + "model = build_model(states, actions)\n", + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.load_weights('dqn_weights.h5f')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_ = dqn.test(env, nb_episodes=5, visualize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RL_test_2/checkpoint b/RL_test_2/checkpoint new file mode 100644 index 0000000..4f75474 --- /dev/null +++ b/RL_test_2/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "dqn_weights.h5f" +all_model_checkpoint_paths: "dqn_weights.h5f" diff --git a/RL_test_2/dqn_weights.h5f.data-00000-of-00001 b/RL_test_2/dqn_weights.h5f.data-00000-of-00001 new file mode 100644 index 0000000000000000000000000000000000000000..ed0d663a784499cf328c9a661187950b4e6a3267 GIT binary patch literal 3850 zcmbu7c~p<-!^TV6_mn~SHyY(%RWrmipNa?*Uko%N@# zhR5iY5N)#1tXEJkzMO@&57JU~Nd#W#A*XfHX;Q{d0^5*GcyK|iX;%gc4OOvSFUR{C2l%DZc#Ez?7Yav%*|o@ z5~C3K-8CBFTZ)KT_n6U!K5$C_HBa?dJpB<;wXau_G19Wyb8FyFI3zeM9WZKv{i0 znNqb9wWl4R8mCIUYNQcf>jK$QU4)%dqK8cv(iop^vcGUDwaPX_$)a^o8}?Sn{ZK`G z>g_RGPX$DB9{Z)W5a(KK+3f0A#{GJn1-47_54%S4BBfrz!8|eEFruA(-dHyD_7(Xv zL6ZF5v_7`Awvbl*_#+iRdY_fHN{I9=Wq5k8hNaJv;=gXM#2JwUuYQPQ$jf2(HrA3z zcRPHNl1975QVjErz}%{dh?|i{=dbj`y9x0yw(27~KP-WJdny`=VqlfDmGnEEp-sux zh&dO-Ee}wFNyK_I*>nlRZ8p>D*RI&KDwZs>m4oHyI<&9YL{1+~qQCU&K>mgSj(xL} zhD!?QI?dUTnmQ9feH?Ck?kB6)>Ogm|1ymBA3!TRNEGV3lM)a$Ugz2TS=>5=3rn#8F z7n7QQr}g>-o{ ztCB|6=X9C5%UxyhW76qC_Y=bNH=Z&njiLV&Si@R0tC&T#CSU12op<}t#5!DWvc#-W zqTx%QGMkzX67#Ns#A)e3>5~{q5kEjb-kV6352@kJ0bb}kI)`<-%A#-}iZndeL#yUI zdi-FC;C#CxM(+KVmQ&@NbixZ&H5slW_5mM$0gY7UuW5Cn%SMh(!wX2-_h-h z&$I9i1uXu{bkWSlWOnrWaps^M&lLRQxk8~VT^PVv1ydH8TzbyF54*w^ep|uTcqSk= zx`k!C-(z}fQxRL+2X}?3yk2@Qe07W1n1&^+^+h5babA|*&naOVcOqF(xVXrtPMi%N zUCh+prL%d1OKJ1ReN=N-3>#ylE3%XIVwF>}nNsgwVjG%^Idmtpw6$X=wR#Ydy+x>7 z)J&=q{P^i|qhS=R0)u}Mq4m+@baRpw2G+XaSbHjwNu7b@%Vos!hBCZNf2s{@JWL^Z`2XqV8jhwNbWXgIstVn>_y zN>Hck$|R&cf!b_qpc7heu)AC#UEVl>&DcDOomjY*7R3~Dc}IStbp!cag|s18?=hYo z?A^x}tIIGuPK6bH?x$TJq;S=&Q+QE#8?tm>kqj?i_}K(v!`R29L*Oa=thX71#!=}0 z8coLyxZKEk;M0h?<0z0==v+=K1VBD#E5@oN6J$x#a zU%G+%;jKKlVT2fj%4r(V=5EcBE#IMmK!JR6Ec{$SkpXBkv$&j2fwPH*+SO}mjX@wxi_r(0NEq<2lB;Bi%=n*p`tiP{{ z2a@0O28LavByJb!lb?YB&n%)As>Ru!`%JDy6mTov0J`G2U6UGXB{>xX(D7a((&GL4%F6K!@FiOnJ8t%!_kkt 
zdwzjjofFT^{F*{S?B9{H`O?UkoIvz;@2E9dr9rHwTR>WRE#*#WlVO%0i7?oSty(+* zcD{Oend<}d9SpVU>STp+23fv*HNR9PRS^241@J?r81kgEQ1Yea>G_Ygz<<6~C-!&7mOBl(OS%TsZM^M)rLq-Liro!$M7-cgB z-s#g&ax;NuR~~~|vmTli6mV;%4ip-vVdv~hdU9MO(hj+xWzA=y*U=^56VmhaGnrTQl8aA~BH8I0f`AR@$is-+ zL~qe5T$q_g?9?;48&A7QMy>#E{kHfe+7KnvR*|cZHn4L0WJC`B02^^_*7bA;IbyL5 zM|}5SrI!b`5ECdbSP1eU8Ey}}h2p|*YhUk(L#<~Q9kJXL?;CY6=|KjyFgO5xb$g6- zh(xu?L6-Z(lIV&@(h0iyqS2WttS6w9wFF1;ZjoNXvbJotJ2-?jl+R_WxHNWo%5A1~ zv4GiDXA!MC4YbCYu@~JVL`KfV?6G17+x||Y&UH#EAHFz^<<0VE6D>_eZF!7Jtn|jH zBUgy&TNQZOTR~@5lTd8vYU%8)nFQ&A1|Zd z`&i++jwysUr4hee3~tKSn9x5AQwEdCiUBL$zHKZ&nshVgt7$NLqQOs!>1Su=#3GKC zA@%TO#Fkef^vg;3h%(4l(Nl8Pnj;Q_`8YA2L2+g&g4?oCqZGj3^Xz1IwilyK-xCY^ zvao#c5(Ex~&^uj##Ag@i%flzQyG@bw{MFgyxOW}6t%G>fXvRB@H{-KfPNK4-0B=7q zcsS8oH{&C{7wyAof~cm~4PhtZXg0{Lox%&mFNvhQ2--%zokcYT2jN}9&L z%96_vmy?$fSCbC#j`RzjWFe;|GfYkW_lsxvx=?@5aKCkH{X;{B8u_8-56p51llWWA zatM?76SEq^hW{;QHG~cS6PqljC?lyR8@M@i-NtoW{DS4g<^RaViC?~NpwJ5$kvm_(FJT)^tqoi2ZKtCxnu~=WvF(f3&(HCR8)F~}2a zSeT^y4 zB-*wIqCnduSm(VLRTL-IHVLphA24e)2$>#roo)}b(1=J2nG_l%G&oE%N|y4IXd#or z0WpmOjHbcsd literal 0 HcmV?d00001 diff --git a/RL_test_3/Untitled.ipynb b/RL_test_3/Untitled.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/RL_test_3/Untitled.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} From 370ebd4ec5f9d83ed68b8844108f34eccdf17343 Mon Sep 17 00:00:00 2001 From: Ben Messerly Date: Fri, 26 Aug 2022 14:57:54 -0500 Subject: [PATCH 3/7] Clean up. --- DumbGame/DumbGame0.py | 93 +++++ DumbGame/DumbGame1.py | 62 ++++ RL_test_4/hanabi_ml_2.ipynb | 228 ++++++++++++ RL_test_5/DeepReinforcementLearning.ipynb | 423 ++++++++++++++++++++++ 4 files changed, 806 insertions(+) create mode 100644 DumbGame/DumbGame0.py create mode 100644 DumbGame/DumbGame1.py create mode 100644 RL_test_4/hanabi_ml_2.ipynb create mode 100644 RL_test_5/DeepReinforcementLearning.ipynb diff --git a/DumbGame/DumbGame0.py b/DumbGame/DumbGame0.py new file mode 100644 index 0000000..828668d --- /dev/null +++ b/DumbGame/DumbGame0.py @@ -0,0 +1,93 @@ +import random + +class Game: + + def __init__(self): + print("New Game. Guess the correct order of numbers 0-5 with a partner. 
8 strikes and you're out.")
+        self.over = False
+        self.n_strikes = 8
+        self.numbers = list(range(0,5))
+        random.shuffle(self.numbers)
+        self.acting_player_index = 0
+        self.score = 0
+        self.state = self.GetState()
+
+    def GetActingPlayerIndex(self):
+        return self.acting_player_index
+
+    def GetNextPlayerIndex(self):
+        return (self.acting_player_index + 1) % 2
+
+    def GetState(self, player_idx=None):
+        s = []
+        s.append(self.n_strikes)
+        s.append(self.acting_player_index)
+        s.append(self.score)
+        # conceal solution from player
+        if player_idx is not None:
+            s.append("")
+        else:
+            s.append(self.numbers)
+        return s
+
+    def NumberIsPlayable(self, guessed_number):
+        return self.numbers[self.score] == guessed_number
+
+    def CheckGameOver(self):
+        if self.n_strikes == 0:
+            return True
+        if self.score == len(self.numbers):
+            return True
+        return False
+
+    def NextTurn(self):
+        start_state = self.GetState()
+        print(f"Player {self.acting_player_index + 1}'s turn to act.")
+        print("Here's what they know about the game:")
+        print(self.GetState(self.acting_player_index))
+        self.Action()
+        #self.GetActingPlayer().Act()
+        self.over = self.CheckGameOver()
+        self.acting_player_index = self.GetNextPlayerIndex()
+        end_state = self.GetState()
+        assert start_state != end_state
+        return end_state
+
+    def Action(self):
+        while True:
+            try:
+                guessed_number = int(input("Guess a number 0-4> "))
+                assert guessed_number in range(0,5)
+                break
+            except ValueError: # not an int
+                continue
+            except AssertionError: # not in 0-4
+                continue
+        if self.NumberIsPlayable(guessed_number):
+            print("Correct!")
+            self.score += 1
+        else:
+            self.n_strikes -= 1
+            print(f"Wrong. {self.n_strikes} strikes remaining.")
+
+
+
+if __name__ == "__main__":
+
+    game = Game()
+    print(game.GetState())
+
+    while not game.over:
+        try:
+            new_state = game.NextTurn()
+            print(game.GetState())
+        except AssertionError:
+            print("Error: game state did not change when a turn was taken.")
+            raise SystemExit(1)
+
+    print("Game finished.")
+
+    if game.score == len(game.numbers):
+        print("Fireworks! You Win!")
+    else:
+        print("Too bad, you lose with a score of", game.score)
diff --git a/DumbGame/DumbGame1.py b/DumbGame/DumbGame1.py
new file mode 100644
index 0000000..f6d8e27
--- /dev/null
+++ b/DumbGame/DumbGame1.py
@@ -0,0 +1,62 @@
+################################################################################
+# Dumb game to be played by machines
+# Guess the correct order of the numbers 1-5 which are shuffled.
+# Keep guessing until you get the whole sequence.
+# Penalized -1 for every wrong guess.
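+# Implementation notes (describing the code below):
+#   - The answer is a shuffled list(range(5)), i.e. the digits 0-4, and the
+#     agent's action space is Discrete(5) to match.
+#   - The observation is simply the index of the next position to guess; the
+#     Box(low=0, high=2) bound is narrower than the values the state actually
+#     reaches (0 through 5).
+#   - Reward is +1 for a correct guess (which advances the position) and -1
+#     otherwise; an episode ends only when all five positions have been
+#     guessed in order (the `state < -50` check can never trigger, since the
+#     state never drops below 0).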
+################################################################################ + +from gym import Env +from gym.spaces import Discrete, Box +import random +import numpy as np + + +class DumbGameEnv(Env): + def __init__(self): + self.n_numbers = 5 + self.answer = list(range(self.n_numbers)) + random.shuffle(self.answer) + self.state = 0 + self.action_space = Discrete(5) + self.observation_space = Box(low=np.array([0],dtype=np.float32), high=np.array([2],dtype=np.float32)) + + def step(self, action): + reward = 0 + if action == self.answer[self.state]: + self.state += 1 + reward = 1 + else: + reward = -1 + + done = self.state == self.n_numbers or self.state < -50 + + info = {} + + # Return step information + return self.state, reward, done, info + + def render(self): + pass + + def reset(self): + random.shuffle(self.answer) + self.state = 0 + self.n_guesses = 0 + return self.state + +if __name__ == "__main__": + env = DumbGameEnv() + #print(env.observation_space.sample()) # 0-1 + #print(env.action_space.sample()) # 0-4 + episodes = 10 + for episode in range(1, episodes+1): + state = env.reset() + done = False + score = 0 + n_guesses = 0 + while not done: + n_guesses += 1 + action = env.action_space.sample() + n_state, reward, done, info = env.step(action) + score+=reward + print(f'Episode:{episode} Score:{score} NGuesses:{n_guesses}') diff --git a/RL_test_4/hanabi_ml_2.ipynb b/RL_test_4/hanabi_ml_2.ipynb new file mode 100644 index 0000000..a83d061 --- /dev/null +++ b/RL_test_4/hanabi_ml_2.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from DumbGame import DumbGameEnv\n", + "import numpy as np\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Flatten\n", + "from tensorflow.keras.optimizers import Adam" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "States:(1,) Actions:5\n" + ] + } + ], + "source": [ + "env = DumbGameEnv()\n", + "states = env.observation_space.shape\n", + "actions = env.action_space.n\n", + "print(f\"States:{states} Actions:{actions}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(states, actions):\n", + " model = Sequential() \n", + " model.add(Dense(24, activation='relu', input_shape=states))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(actions, activation='linear'))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from rl.agents import DQNAgent\n", + "from rl.policy import BoltzmannQPolicy\n", + "from rl.memory import SequentialMemory\n", + "\n", + "def build_agent(model, actions):\n", + " policy = BoltzmannQPolicy()\n", + " memory = SequentialMemory(limit=20000, window_length=1)\n", + " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", + " nb_actions=actions, nb_steps_warmup=20, target_model_update=0.1)\n", + " return dqn" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + 
"=================================================================\n", + "dense (Dense) (None, 24) 48 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (None, 24) 600 \n", + "_________________________________________________________________\n", + "dense_2 (Dense) (None, 5) 125 \n", + "=================================================================\n", + "Total params: 773\n", + "Trainable params: 773\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model = build_model(states, actions)\n", + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training for 5000 steps ...\n", + "Interval 1 (0 steps performed)\n", + "WARNING:tensorflow:From /Users/Ben/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + " 1/10000 [..............................] - ETA: 8:06 - reward: -1.0000" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/anaconda3/lib/python3.6/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", + " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 4995/10000 [=============>................] 
- ETA: 29s - reward: -0.6040done, took 29.182 seconds\n" + ] + } + ], + "source": [ + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=0.1))#, metrics=['mae'])\n", + "history = dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'nb_steps': 5000}\n", + "{'episode_reward': [-16.0, -10.0, -8.0, -24.0, -26.0, -2.0, -8.0, -21.0, -1.0, -2.0, -16.0, 0.0, -31.0, -38.0, -23.0, -38.0, -21.0, -22.0, -4.0, -9.0, -19.0, -19.0, -20.0, -21.0, -8.0, -29.0, -26.0, -18.0, -22.0, -10.0, -17.0, -17.0, -19.0, -50.0, -31.0, -16.0, -4.0, -13.0, -23.0, -29.0, -18.0, -22.0, -21.0, -9.0, -9.0, 0.0, -20.0, -28.0, -20.0, -26.0, -13.0, -25.0, -3.0, -11.0, -9.0, -7.0, -18.0, -11.0, -20.0, -15.0, -14.0, -15.0, -5.0, 0.0, -27.0, -46.0, -13.0, -7.0, -16.0, -16.0, -16.0, -12.0, -9.0, -16.0, -3.0, -13.0, -15.0, -5.0, -44.0, -11.0, 1.0, -26.0, -21.0, -18.0, -27.0, -33.0, -24.0, -12.0, -13.0, -6.0, -22.0, -1.0, -11.0, -2.0, -24.0, -1.0, -14.0, -7.0, -8.0, -27.0, -14.0, -18.0, -31.0, -20.0, -9.0, -3.0, 0.0, -27.0, -7.0, -16.0, -16.0, -10.0, -31.0, -14.0, -7.0, -16.0, -3.0, -24.0, -16.0, -39.0, -44.0, -20.0, -8.0, -2.0, -25.0, -8.0, -24.0, -36.0, -6.0, -20.0, -11.0, -21.0, -27.0, -33.0, -8.0, -12.0, -28.0, -8.0, -35.0, -4.0, -8.0, -27.0, 1.0, -8.0, -9.0, -12.0, -20.0, -14.0, 1.0, -23.0, -15.0, -5.0, -2.0, -14.0, -29.0, -18.0, -24.0, -4.0, -7.0, -26.0, -11.0, -18.0, -15.0, -10.0, -25.0, -6.0, -5.0, -13.0, -8.0, -13.0, -2.0, -7.0, -3.0, -9.0, -20.0, 1.0, -10.0, -4.0, -23.0, -16.0, -24.0, -9.0, -9.0, -6.0, -8.0, -10.0, -26.0, 1.0, -24.0, -14.0, -6.0, -22.0, -3.0, -1.0, -2.0, -14.0, -8.0, -15.0], 'nb_episode_steps': [26, 20, 18, 34, 36, 12, 18, 31, 11, 12, 26, 10, 41, 48, 33, 48, 31, 32, 14, 19, 29, 29, 30, 31, 18, 39, 36, 28, 32, 20, 27, 27, 29, 60, 41, 26, 14, 23, 33, 39, 28, 32, 31, 19, 19, 10, 30, 38, 30, 36, 23, 35, 13, 21, 19, 17, 28, 21, 30, 25, 24, 25, 15, 10, 37, 56, 23, 17, 26, 26, 26, 22, 19, 26, 13, 23, 25, 15, 54, 21, 9, 36, 31, 28, 37, 43, 34, 22, 23, 16, 32, 11, 21, 12, 34, 11, 24, 17, 18, 37, 24, 28, 41, 30, 19, 13, 10, 37, 17, 26, 26, 20, 41, 24, 17, 26, 13, 34, 26, 49, 54, 30, 18, 12, 35, 18, 34, 46, 16, 30, 21, 31, 37, 43, 18, 22, 38, 18, 45, 14, 18, 37, 9, 18, 19, 22, 30, 24, 9, 33, 25, 15, 12, 24, 39, 28, 34, 14, 17, 36, 21, 28, 25, 20, 35, 16, 15, 23, 18, 23, 12, 17, 13, 19, 30, 9, 20, 14, 33, 26, 34, 19, 19, 16, 18, 20, 36, 9, 34, 24, 16, 32, 13, 11, 12, 24, 18, 25], 'nb_steps': [26, 46, 64, 98, 134, 146, 164, 195, 206, 218, 244, 254, 295, 343, 376, 424, 455, 487, 501, 520, 549, 578, 608, 639, 657, 696, 732, 760, 792, 812, 839, 866, 895, 955, 996, 1022, 1036, 1059, 1092, 1131, 1159, 1191, 1222, 1241, 1260, 1270, 1300, 1338, 1368, 1404, 1427, 1462, 1475, 1496, 1515, 1532, 1560, 1581, 1611, 1636, 1660, 1685, 1700, 1710, 1747, 1803, 1826, 1843, 1869, 1895, 1921, 1943, 1962, 1988, 2001, 2024, 2049, 2064, 2118, 2139, 2148, 2184, 2215, 2243, 2280, 2323, 2357, 2379, 2402, 2418, 2450, 2461, 2482, 2494, 2528, 2539, 2563, 2580, 2598, 2635, 2659, 2687, 2728, 2758, 2777, 2790, 2800, 2837, 2854, 2880, 2906, 2926, 2967, 2991, 3008, 3034, 3047, 3081, 3107, 3156, 3210, 3240, 3258, 3270, 3305, 3323, 3357, 3403, 3419, 3449, 3470, 3501, 3538, 3581, 3599, 3621, 3659, 3677, 3722, 3736, 3754, 3791, 3800, 3818, 3837, 3859, 3889, 3913, 3922, 3955, 3980, 3995, 4007, 4031, 4070, 4098, 4132, 4146, 4163, 4199, 4220, 4248, 4273, 4293, 4328, 4344, 4359, 4382, 
4400, 4423, 4435, 4452, 4465, 4484, 4514, 4523, 4543, 4557, 4590, 4616, 4650, 4669, 4688, 4704, 4722, 4742, 4778, 4787, 4821, 4845, 4861, 4893, 4906, 4917, 4929, 4953, 4971, 4996]}\n" + ] + } + ], + "source": [ + "print(history.params)\n", + "print(history.history)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 1 episodes ...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mepisode_step\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0mdone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterminal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0mepisode_reward\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, reward, terminal)\u001b[0m\n\u001b[1;32m 240\u001b[0m training=self.training)\n\u001b[1;32m 241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 242\u001b[0;31m \u001b[0mmetrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 243\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;31m# We're done here. 
No need to update the experience memory since we only use the working\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mmetrics_names\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m 
\u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "scores = dqn.test(env, nb_episodes=1, visualize=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#print(np.mean(scores.history['episode_reward']))\n", + "#dqn.get_config()\n", + "#scores = dqn.test(env, nb_episodes=1, visualize=False, verbose=1)\n", + "#test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1)\n", + "#print(np.mean(scores.history['episode_reward']))callbacks = callbacks[:]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RL_test_5/DeepReinforcementLearning.ipynb b/RL_test_5/DeepReinforcementLearning.ipynb new file mode 100644 index 0000000..fc934a5 --- /dev/null +++ b/RL_test_5/DeepReinforcementLearning.ipynb @@ -0,0 +1,423 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 0. Install Dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Test Random Environment with OpenAI Gym" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "States:(1,) Actions:5\n" + ] + } + ], + "source": [ + "from DumbGame import DumbGameEnv\n", + "env = DumbGameEnv()\n", + "states = env.observation_space.shape\n", + "actions = env.action_space.n\n", + "print(f\"States:{states} Actions:{actions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. 
Create a Deep Learning Model with Keras" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Flatten\n", + "from tensorflow.keras.optimizers import Adam" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "states = env.observation_space.shape\n", + "actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(states, actions):\n", + " model = Sequential() \n", + " model.add(Dense(24, activation='relu', input_shape=states))\n", + " model.add(Dense(24, activation='relu'))\n", + " model.add(Dense(actions, activation='linear'))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "del model " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model(states, actions)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "dense_6 (Dense) (None, 24) 48 \n", + "_________________________________________________________________\n", + "dense_7 (Dense) (None, 24) 600 \n", + "_________________________________________________________________\n", + "dense_8 (Dense) (None, 5) 125 \n", + "=================================================================\n", + "Total params: 773\n", + "Trainable params: 773\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Build Agent with Keras-RL" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from rl.agents import DQNAgent\n", + "from rl.policy import BoltzmannQPolicy\n", + "from rl.memory import SequentialMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def build_agent(model, actions):\n", + " policy = BoltzmannQPolicy()\n", + " memory = SequentialMemory(limit=50000, window_length=1)\n", + " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", + " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", + " return dqn" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training for 50000 steps ...\n", + "Interval 1 (0 steps performed)\n", + "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + " 1/10000 [..............................] - ETA: 9:10 - reward: -1.0000" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", + " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000/10000 [==============================] - 73s 7ms/step - reward: -0.5974\n", + "402 episodes - episode_reward: -14.821 [-47.000, 5.000] - loss: 1.242 - mae: 5.141 - mean_q: -6.008\n", + "\n", + "Interval 2 (10000 steps performed)\n", + "10000/10000 [==============================] - 70s 7ms/step - reward: -0.5944\n", + "406 episodes - episode_reward: -14.675 [-52.000, 3.000] - loss: 1.479 - mae: 6.070 - mean_q: -7.153\n", + "\n", + "Interval 3 (20000 steps performed)\n", + "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5978\n", + "402 episodes - episode_reward: -14.876 [-59.000, 2.000] - loss: 1.487 - mae: 6.079 - mean_q: -7.167\n", + "\n", + "Interval 4 (30000 steps performed)\n", + "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5982\n", + "402 episodes - episode_reward: -14.883 [-60.000, 4.000] - loss: 1.505 - mae: 6.153 - mean_q: -7.265\n", + "\n", + "Interval 5 (40000 steps performed)\n", + "10000/10000 [==============================] - 73s 7ms/step - reward: -0.6216\n", + "done, took 357.885 seconds\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", + "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 100 episodes ...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + 
"output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/wp/_fng4ppn01b2j4_j98240s780000gn/T/ipykernel_10921/978772492.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'episode_reward'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepisode_step\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 341\u001b[0;31m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 342\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, observation)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;31m# Select an action.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0mstate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmemory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_recent_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 224\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 225\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_q_values\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_batch_q_values\u001b[0;34m(self, state_batch)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_state_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_on_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py\u001b[0m in \u001b[0;36mpredict_on_batch\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1213\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_predict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1214\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 3823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3824\u001b[0m fetched = self._callable_fn(*array_vals,\n\u001b[0;32m-> 3825\u001b[0;31m run_metadata=self.run_metadata)\n\u001b[0m\u001b[1;32m 3826\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_fetch_callbacks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3827\u001b[0m output_structure = nest.pack_sequence_as(\n", + "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1470\u001b[0m ret = tf_session.TF_SessionRunCallable(self._session._session,\n\u001b[1;32m 1471\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1472\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 1473\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1474\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", + "print(np.mean(scores.history['episode_reward']))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Testing for 15 episodes ...\n", + "Episode 1: reward: 200.000, steps: 200\n", + "Episode 2: reward: 200.000, steps: 200\n", + "Episode 3: reward: 200.000, steps: 200\n", + "Episode 4: reward: 200.000, steps: 200\n", + "Episode 5: reward: 200.000, steps: 200\n", + "Episode 6: reward: 200.000, steps: 200\n", + "Episode 7: reward: 200.000, steps: 200\n", + "Episode 8: reward: 200.000, steps: 200\n", + "Episode 9: reward: 200.000, steps: 200\n", + "Episode 10: reward: 200.000, steps: 200\n", + "Episode 11: reward: 200.000, steps: 200\n", + "Episode 12: reward: 200.000, steps: 200\n", + "Episode 13: reward: 200.000, steps: 200\n", + "Episode 14: reward: 200.000, steps: 200\n", + "Episode 15: reward: 200.000, steps: 200\n" + ] + } + ], + "source": [ + "_ = dqn.test(env, nb_episodes=15, visualize=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Reloading Agent from Memory" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.save_weights('dqn_weights.h5f', overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "del model\n", + "del dqn\n", + "del env" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make('CartPole-v0')\n", + "actions = env.action_space.n\n", + "states = env.observation_space.shape[0]\n", + "model = build_model(states, actions)\n", + "dqn = build_agent(model, actions)\n", + "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "dqn.load_weights('dqn_weights.h5f')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing for 5 episodes ...\n", + "WARNING:tensorflow:From /Users/nicholasrenotte/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", + "Episode 1: reward: 200.000, steps: 200\n", + "Episode 2: reward: 200.000, steps: 200\n", + "Episode 3: reward: 200.000, steps: 200\n", + "Episode 4: reward: 200.000, steps: 200\n", + "Episode 5: reward: 200.000, steps: 200\n" + ] + } + ], + "source": [ + "_ = dqn.test(env, nb_episodes=5, visualize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a56bcb4895e73156b6611e8416475c55337aff27 Mon Sep 17 00:00:00 2001 From: Ben Messerly Date: Fri, 26 Aug 2022 14:58:18 -0500 Subject: [PATCH 4/7] Cleanup. 
--- DeepReinforcementLearning.ipynb | 423 -------------------------------- DumbGame.py | 62 ----- hanabi_ml_2.ipynb | 228 ----------------- tests/DumbGame.py | 93 ------- 4 files changed, 806 deletions(-) delete mode 100644 DeepReinforcementLearning.ipynb delete mode 100644 DumbGame.py delete mode 100644 hanabi_ml_2.ipynb delete mode 100644 tests/DumbGame.py diff --git a/DeepReinforcementLearning.ipynb b/DeepReinforcementLearning.ipynb deleted file mode 100644 index fc934a5..0000000 --- a/DeepReinforcementLearning.ipynb +++ /dev/null @@ -1,423 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0. Install Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Test Random Environment with OpenAI Gym" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "States:(1,) Actions:5\n" - ] - } - ], - "source": [ - "from DumbGame import DumbGameEnv\n", - "env = DumbGameEnv()\n", - "states = env.observation_space.shape\n", - "actions = env.action_space.n\n", - "print(f\"States:{states} Actions:{actions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Create a Deep Learning Model with Keras" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from tensorflow.keras.models import Sequential\n", - "from tensorflow.keras.layers import Dense, Flatten\n", - "from tensorflow.keras.optimizers import Adam" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "states = env.observation_space.shape\n", - "actions = env.action_space.n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "actions" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(states, actions):\n", - " model = Sequential() \n", - " model.add(Dense(24, activation='relu', input_shape=states))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(actions, activation='linear'))\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "del model " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "model = build_model(states, actions)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential_2\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "dense_6 (Dense) (None, 24) 48 \n", - "_________________________________________________________________\n", - "dense_7 (Dense) (None, 24) 600 \n", - "_________________________________________________________________\n", - "dense_8 (Dense) (None, 5) 125 \n", - "=================================================================\n", - "Total params: 773\n", - "Trainable params: 773\n", - "Non-trainable params: 0\n", - 
"_________________________________________________________________\n" - ] - } - ], - "source": [ - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Build Agent with Keras-RL" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from rl.agents import DQNAgent\n", - "from rl.policy import BoltzmannQPolicy\n", - "from rl.memory import SequentialMemory" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def build_agent(model, actions):\n", - " policy = BoltzmannQPolicy()\n", - " memory = SequentialMemory(limit=50000, window_length=1)\n", - " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", - " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", - " return dqn" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training for 50000 steps ...\n", - "Interval 1 (0 steps performed)\n", - "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - " 1/10000 [..............................] - ETA: 9:10 - reward: -1.0000" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", - " warnings.warn('Not enough entries to sample without replacement. 
Consider increasing your warm-up phase to avoid oversampling!')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10000/10000 [==============================] - 73s 7ms/step - reward: -0.5974\n", - "402 episodes - episode_reward: -14.821 [-47.000, 5.000] - loss: 1.242 - mae: 5.141 - mean_q: -6.008\n", - "\n", - "Interval 2 (10000 steps performed)\n", - "10000/10000 [==============================] - 70s 7ms/step - reward: -0.5944\n", - "406 episodes - episode_reward: -14.675 [-52.000, 3.000] - loss: 1.479 - mae: 6.070 - mean_q: -7.153\n", - "\n", - "Interval 3 (20000 steps performed)\n", - "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5978\n", - "402 episodes - episode_reward: -14.876 [-59.000, 2.000] - loss: 1.487 - mae: 6.079 - mean_q: -7.167\n", - "\n", - "Interval 4 (30000 steps performed)\n", - "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5982\n", - "402 episodes - episode_reward: -14.883 [-60.000, 4.000] - loss: 1.505 - mae: 6.153 - mean_q: -7.265\n", - "\n", - "Interval 5 (40000 steps performed)\n", - "10000/10000 [==============================] - 73s 7ms/step - reward: -0.6216\n", - "done, took 357.885 seconds\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", - "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 100 episodes ...\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/wp/_fng4ppn01b2j4_j98240s780000gn/T/ipykernel_10921/978772492.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'episode_reward'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 339\u001b[0m 
\u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepisode_step\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 341\u001b[0;31m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 342\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, observation)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;31m# Select an action.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0mstate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmemory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_recent_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 224\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 225\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_q_values\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
69\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_batch_q_values\u001b[0;34m(self, state_batch)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_state_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_on_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py\u001b[0m in \u001b[0;36mpredict_on_batch\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1213\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_predict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1214\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 3823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3824\u001b[0m fetched = self._callable_fn(*array_vals,\n\u001b[0;32m-> 3825\u001b[0;31m 
run_metadata=self.run_metadata)\n\u001b[0m\u001b[1;32m 3826\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_fetch_callbacks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3827\u001b[0m output_structure = nest.pack_sequence_as(\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1470\u001b[0m ret = tf_session.TF_SessionRunCallable(self._session._session,\n\u001b[1;32m 1471\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1472\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 1473\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1474\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", - "print(np.mean(scores.history['episode_reward']))" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 15 episodes ...\n", - "Episode 1: reward: 200.000, steps: 200\n", - "Episode 2: reward: 200.000, steps: 200\n", - "Episode 3: reward: 200.000, steps: 200\n", - "Episode 4: reward: 200.000, steps: 200\n", - "Episode 5: reward: 200.000, steps: 200\n", - "Episode 6: reward: 200.000, steps: 200\n", - "Episode 7: reward: 200.000, steps: 200\n", - "Episode 8: reward: 200.000, steps: 200\n", - "Episode 9: reward: 200.000, steps: 200\n", - "Episode 10: reward: 200.000, steps: 200\n", - "Episode 11: reward: 200.000, steps: 200\n", - "Episode 12: reward: 200.000, steps: 200\n", - "Episode 13: reward: 200.000, steps: 200\n", - "Episode 14: reward: 200.000, steps: 200\n", - "Episode 15: reward: 200.000, steps: 200\n" - ] - } - ], - "source": [ - "_ = dqn.test(env, nb_episodes=15, visualize=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. 
Reloading Agent from Memory" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.save_weights('dqn_weights.h5f', overwrite=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "del model\n", - "del dqn\n", - "del env" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make('CartPole-v0')\n", - "actions = env.action_space.n\n", - "states = env.observation_space.shape[0]\n", - "model = build_model(states, actions)\n", - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.load_weights('dqn_weights.h5f')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 5 episodes ...\n", - "WARNING:tensorflow:From /Users/nicholasrenotte/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - "Episode 1: reward: 200.000, steps: 200\n", - "Episode 2: reward: 200.000, steps: 200\n", - "Episode 3: reward: 200.000, steps: 200\n", - "Episode 4: reward: 200.000, steps: 200\n", - "Episode 5: reward: 200.000, steps: 200\n" - ] - } - ], - "source": [ - "_ = dqn.test(env, nb_episodes=5, visualize=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/DumbGame.py b/DumbGame.py deleted file mode 100644 index f6d8e27..0000000 --- a/DumbGame.py +++ /dev/null @@ -1,62 +0,0 @@ -################################################################################ -# Dumb game to be played by machines -# Guess the correct order of the numbers 1-5 which are shuffled. -# Keep guessing until you get the whole sequence. -# Penalized -1 for every wrong guess. 
-################################################################################ - -from gym import Env -from gym.spaces import Discrete, Box -import random -import numpy as np - - -class DumbGameEnv(Env): - def __init__(self): - self.n_numbers = 5 - self.answer = list(range(self.n_numbers)) - random.shuffle(self.answer) - self.state = 0 - self.action_space = Discrete(5) - self.observation_space = Box(low=np.array([0],dtype=np.float32), high=np.array([2],dtype=np.float32)) - - def step(self, action): - reward = 0 - if action == self.answer[self.state]: - self.state += 1 - reward = 1 - else: - reward = -1 - - done = self.state == self.n_numbers or self.state < -50 - - info = {} - - # Return step information - return self.state, reward, done, info - - def render(self): - pass - - def reset(self): - random.shuffle(self.answer) - self.state = 0 - self.n_guesses = 0 - return self.state - -if __name__ == "__main__": - env = DumbGameEnv() - #print(env.observation_space.sample()) # 0-1 - #print(env.action_space.sample()) # 0-4 - episodes = 10 - for episode in range(1, episodes+1): - state = env.reset() - done = False - score = 0 - n_guesses = 0 - while not done: - n_guesses += 1 - action = env.action_space.sample() - n_state, reward, done, info = env.step(action) - score+=reward - print(f'Episode:{episode} Score:{score} NGuesses:{n_guesses}') diff --git a/hanabi_ml_2.ipynb b/hanabi_ml_2.ipynb deleted file mode 100644 index a83d061..0000000 --- a/hanabi_ml_2.ipynb +++ /dev/null @@ -1,228 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from DumbGame import DumbGameEnv\n", - "import numpy as np\n", - "from tensorflow.keras.models import Sequential\n", - "from tensorflow.keras.layers import Dense, Flatten\n", - "from tensorflow.keras.optimizers import Adam" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "States:(1,) Actions:5\n" - ] - } - ], - "source": [ - "env = DumbGameEnv()\n", - "states = env.observation_space.shape\n", - "actions = env.action_space.n\n", - "print(f\"States:{states} Actions:{actions}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(states, actions):\n", - " model = Sequential() \n", - " model.add(Dense(24, activation='relu', input_shape=states))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(actions, activation='linear'))\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from rl.agents import DQNAgent\n", - "from rl.policy import BoltzmannQPolicy\n", - "from rl.memory import SequentialMemory\n", - "\n", - "def build_agent(model, actions):\n", - " policy = BoltzmannQPolicy()\n", - " memory = SequentialMemory(limit=20000, window_length=1)\n", - " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", - " nb_actions=actions, nb_steps_warmup=20, target_model_update=0.1)\n", - " return dqn" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "dense (Dense) (None, 
24) 48 \n", - "_________________________________________________________________\n", - "dense_1 (Dense) (None, 24) 600 \n", - "_________________________________________________________________\n", - "dense_2 (Dense) (None, 5) 125 \n", - "=================================================================\n", - "Total params: 773\n", - "Trainable params: 773\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model = build_model(states, actions)\n", - "model.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training for 5000 steps ...\n", - "Interval 1 (0 steps performed)\n", - "WARNING:tensorflow:From /Users/Ben/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - " 1/10000 [..............................] - ETA: 8:06 - reward: -1.0000" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/anaconda3/lib/python3.6/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", - " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 4995/10000 [=============>................] - ETA: 29s - reward: -0.6040done, took 29.182 seconds\n" - ] - } - ], - "source": [ - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=0.1))#, metrics=['mae'])\n", - "history = dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'nb_steps': 5000}\n", - "{'episode_reward': [-16.0, -10.0, -8.0, -24.0, -26.0, -2.0, -8.0, -21.0, -1.0, -2.0, -16.0, 0.0, -31.0, -38.0, -23.0, -38.0, -21.0, -22.0, -4.0, -9.0, -19.0, -19.0, -20.0, -21.0, -8.0, -29.0, -26.0, -18.0, -22.0, -10.0, -17.0, -17.0, -19.0, -50.0, -31.0, -16.0, -4.0, -13.0, -23.0, -29.0, -18.0, -22.0, -21.0, -9.0, -9.0, 0.0, -20.0, -28.0, -20.0, -26.0, -13.0, -25.0, -3.0, -11.0, -9.0, -7.0, -18.0, -11.0, -20.0, -15.0, -14.0, -15.0, -5.0, 0.0, -27.0, -46.0, -13.0, -7.0, -16.0, -16.0, -16.0, -12.0, -9.0, -16.0, -3.0, -13.0, -15.0, -5.0, -44.0, -11.0, 1.0, -26.0, -21.0, -18.0, -27.0, -33.0, -24.0, -12.0, -13.0, -6.0, -22.0, -1.0, -11.0, -2.0, -24.0, -1.0, -14.0, -7.0, -8.0, -27.0, -14.0, -18.0, -31.0, -20.0, -9.0, -3.0, 0.0, -27.0, -7.0, -16.0, -16.0, -10.0, -31.0, -14.0, -7.0, -16.0, -3.0, -24.0, -16.0, -39.0, -44.0, -20.0, -8.0, -2.0, -25.0, -8.0, -24.0, -36.0, -6.0, -20.0, -11.0, -21.0, -27.0, -33.0, -8.0, -12.0, -28.0, -8.0, -35.0, -4.0, -8.0, -27.0, 1.0, -8.0, -9.0, -12.0, -20.0, -14.0, 1.0, -23.0, -15.0, -5.0, -2.0, -14.0, -29.0, -18.0, -24.0, -4.0, -7.0, -26.0, -11.0, -18.0, -15.0, -10.0, -25.0, -6.0, -5.0, -13.0, -8.0, -13.0, -2.0, -7.0, -3.0, -9.0, -20.0, 1.0, -10.0, -4.0, -23.0, -16.0, -24.0, -9.0, -9.0, -6.0, -8.0, -10.0, -26.0, 1.0, -24.0, -14.0, -6.0, -22.0, -3.0, -1.0, -2.0, -14.0, -8.0, -15.0], 'nb_episode_steps': 
[26, 20, 18, 34, 36, 12, 18, 31, 11, 12, 26, 10, 41, 48, 33, 48, 31, 32, 14, 19, 29, 29, 30, 31, 18, 39, 36, 28, 32, 20, 27, 27, 29, 60, 41, 26, 14, 23, 33, 39, 28, 32, 31, 19, 19, 10, 30, 38, 30, 36, 23, 35, 13, 21, 19, 17, 28, 21, 30, 25, 24, 25, 15, 10, 37, 56, 23, 17, 26, 26, 26, 22, 19, 26, 13, 23, 25, 15, 54, 21, 9, 36, 31, 28, 37, 43, 34, 22, 23, 16, 32, 11, 21, 12, 34, 11, 24, 17, 18, 37, 24, 28, 41, 30, 19, 13, 10, 37, 17, 26, 26, 20, 41, 24, 17, 26, 13, 34, 26, 49, 54, 30, 18, 12, 35, 18, 34, 46, 16, 30, 21, 31, 37, 43, 18, 22, 38, 18, 45, 14, 18, 37, 9, 18, 19, 22, 30, 24, 9, 33, 25, 15, 12, 24, 39, 28, 34, 14, 17, 36, 21, 28, 25, 20, 35, 16, 15, 23, 18, 23, 12, 17, 13, 19, 30, 9, 20, 14, 33, 26, 34, 19, 19, 16, 18, 20, 36, 9, 34, 24, 16, 32, 13, 11, 12, 24, 18, 25], 'nb_steps': [26, 46, 64, 98, 134, 146, 164, 195, 206, 218, 244, 254, 295, 343, 376, 424, 455, 487, 501, 520, 549, 578, 608, 639, 657, 696, 732, 760, 792, 812, 839, 866, 895, 955, 996, 1022, 1036, 1059, 1092, 1131, 1159, 1191, 1222, 1241, 1260, 1270, 1300, 1338, 1368, 1404, 1427, 1462, 1475, 1496, 1515, 1532, 1560, 1581, 1611, 1636, 1660, 1685, 1700, 1710, 1747, 1803, 1826, 1843, 1869, 1895, 1921, 1943, 1962, 1988, 2001, 2024, 2049, 2064, 2118, 2139, 2148, 2184, 2215, 2243, 2280, 2323, 2357, 2379, 2402, 2418, 2450, 2461, 2482, 2494, 2528, 2539, 2563, 2580, 2598, 2635, 2659, 2687, 2728, 2758, 2777, 2790, 2800, 2837, 2854, 2880, 2906, 2926, 2967, 2991, 3008, 3034, 3047, 3081, 3107, 3156, 3210, 3240, 3258, 3270, 3305, 3323, 3357, 3403, 3419, 3449, 3470, 3501, 3538, 3581, 3599, 3621, 3659, 3677, 3722, 3736, 3754, 3791, 3800, 3818, 3837, 3859, 3889, 3913, 3922, 3955, 3980, 3995, 4007, 4031, 4070, 4098, 4132, 4146, 4163, 4199, 4220, 4248, 4273, 4293, 4328, 4344, 4359, 4382, 4400, 4423, 4435, 4452, 4465, 4484, 4514, 4523, 4543, 4557, 4590, 4616, 4650, 4669, 4688, 4704, 4722, 4742, 4778, 4787, 4821, 4845, 4861, 4893, 4906, 4917, 4929, 4953, 4971, 4996]}\n" - ] - } - ], - "source": [ - "print(history.params)\n", - "print(history.history)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 1 episodes ...\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mepisode_step\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;34m-\u001b[0m 
\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0mdone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterminal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0mepisode_reward\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, reward, terminal)\u001b[0m\n\u001b[1;32m 240\u001b[0m training=self.training)\n\u001b[1;32m 241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 242\u001b[0;31m \u001b[0mmetrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 243\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;31m# We're done here. No need to update the experience memory since we only use the working\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mmetrics_names\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "scores = dqn.test(env, nb_episodes=1, visualize=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#print(np.mean(scores.history['episode_reward']))\n", - "#dqn.get_config()\n", - "#scores = dqn.test(env, nb_episodes=1, visualize=False, verbose=1)\n", - "#test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1)\n", - "#print(np.mean(scores.history['episode_reward']))callbacks = callbacks[:]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - 
}, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/DumbGame.py b/tests/DumbGame.py deleted file mode 100644 index 828668d..0000000 --- a/tests/DumbGame.py +++ /dev/null @@ -1,93 +0,0 @@ -import random - -class Game: - - def __init__(self): - print("New Game. Guess the correct order of numbers 0-5 with a partner. 7 strikes and you're out.") - self.over = False - self.n_strikes = 8 - self.numbers = list(range(0,5)) - random.shuffle(self.numbers) - self.acting_player_index = 0 - self.score = 0 - self.state = self.GetState() - - def GetActingPlayerIndex(self): - return self.acting_player_index - - def GetNextPlayerIndex(self): - return (self.acting_player_index + 1) % 2 - - def GetState(self, player_idx=None): - s = [] - s.append(self.n_strikes) - s.append(self.acting_player_index) - s.append(self.score) - # conceal solution from player - if player_idx is not None: - self.state.append("") - else: - s.append(self.numbers) - return s - - def NumberIsPlayable(self, guessed_number): - return self.numbers[self.score] == guessed_number - - def CheckGameOver(self): - if self.n_strikes == 0: - return True - if self.score == len(self.numbers): - return True - return False - - def NextTurn(self): - start_state = self.GetState() - print(f"Player {self.acting_player_index + 1}'s turn to act.") - print("Here's what they know about the game:") - print(self.GetState(self.acting_player_index)) - self.Action() - #self.GetActingPlayer().Act() - self.over = self.CheckGameOver() - self.acting_player_index = self.GetNextPlayerIndex() - end_state = self.GetState() - assert start_state != end_state - return end_state - - def Action(self): - while True: - try: - guessed_number = int(input("Guess a number 0-9> ")) - assert guessed_number in range(0,9) - break - except ValueError: # not an int - continue - except AssertionError: # not 0-9 - continue - if self.NumberIsPlayable(guessed_number): - print("Correct!") - self.score += 1 - else: - self.n_strikes -= 1 - print(f"Wrong. {self.n_strikes} strikes remaining.") - - - -if __name__ == "__main__": - - game = Game() - print(game.GetState()) - - while not game.over: - try: - new_state = game.NextTurn() - print(game.GetState()) - except AssertionError: - print("Error: game state did not change when a turn was taken.") - sys.exit(1) - - print("Game finished.") - - if game.score == len(game.numbers): - print("Fireworks! You Win!") - else: - print("Too bad, you lose with a score of", game.GetScore()) From 4dbb04cf76ef78f4dc0ebc4501727cf1bfaa1a44 Mon Sep 17 00:00:00 2001 From: Ben Messerly Date: Fri, 26 Aug 2022 16:00:48 -0500 Subject: [PATCH 5/7] Remove notebooks in favor of python files. Everything synced with jupytext. 
--- ...ICustonEnvironmentReinforcementLearning.py | 143 ++++ ...m Environment Reinforcement Learning.ipynb | 701 ------------------ ...ICustomEnvironmentReinforcementLearning.py | 169 +++++ RL_test_2/Deep Reinforcement Learning.ipynb | 451 ----------- RL_test_2/DeepReinforcementLearning.py | 115 +++ RL_test_3/Untitled.ipynb | 6 - RL_test_3/Untitled.py | 10 + RL_test_4/hanabi_ml_2.ipynb | 228 ------ RL_test_4/hanabi_ml_2.py | 68 ++ RL_test_5/DeepReinforcementLearning.ipynb | 423 ----------- RL_test_5/DeepReinforcementLearning.py | 95 +++ 11 files changed, 600 insertions(+), 1809 deletions(-) create mode 100644 RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py delete mode 100644 RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb create mode 100644 RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py delete mode 100644 RL_test_2/Deep Reinforcement Learning.ipynb create mode 100644 RL_test_2/DeepReinforcementLearning.py delete mode 100644 RL_test_3/Untitled.ipynb create mode 100644 RL_test_3/Untitled.py delete mode 100644 RL_test_4/hanabi_ml_2.ipynb create mode 100644 RL_test_4/hanabi_ml_2.py delete mode 100644 RL_test_5/DeepReinforcementLearning.ipynb create mode 100644 RL_test_5/DeepReinforcementLearning.py diff --git a/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py b/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py new file mode 100644 index 0000000..8486a3a --- /dev/null +++ b/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py @@ -0,0 +1,143 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.14.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# # 0. Install Dependencies + +# # 1. Test Random Environment with OpenAI Gym + +from gym import Env +from gym.spaces import Discrete, Box +import numpy as np +import random + + +class ShowerEnv(Env): + def __init__(self): + # Actions we can take, down, stay, up + self.action_space = Discrete(3) + # Temperature array + self.observation_space = Box(low=np.array([0]), high=np.array([100])) + # Set start temp + self.state = 38 + random.randint(-3,3) + # Set shower length + self.shower_length = 60 + + def step(self, action): + # Apply action + # 0 -1 = -1 temperature + # 1 -1 = 0 + # 2 -1 = 1 temperature + self.state += action -1 + # Reduce shower length by 1 second + self.shower_length -= 1 + + # Calculate reward + if self.state >=37 and self.state <=39: + reward =1 + else: + reward = -1 + + # Check if shower is done + if self.shower_length <= 0: + done = True + else: + done = False + + # Apply temperature noise + #self.state += random.randint(-1,1) + # Set placeholder for info + info = {} + + # Return step information + return self.state, reward, done, info + + def render(self): + # Implement viz + pass + + def reset(self): + # Reset shower temperature + self.state = 38 + random.randint(-3,3) + # Reset shower time + self.shower_length = 60 + return self.state + + + +env = ShowerEnv() + +env.observation_space.sample() + +episodes = 10 +for episode in range(1, episodes+1): + state = env.reset() + done = False + score = 0 + + while not done: + #env.render() + action = env.action_space.sample() + n_state, reward, done, info = env.step(action) + score+=reward + print('Episode:{} Score:{}'.format(episode, score)) + +# # 2. 
Create a Deep Learning Model with Keras + +import numpy as np +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Flatten +from tensorflow.keras.optimizers import Adam + +states = env.observation_space.shape +actions = env.action_space.n + +actions + + +def build_model(states, actions): + model = Sequential() + model.add(Dense(24, activation='relu', input_shape=states)) + model.add(Dense(24, activation='relu')) + model.add(Dense(actions, activation='linear')) + return model + + +del model + +model = build_model(states, actions) + +model.summary() + +# # 3. Build Agent with Keras-RL + +from rl.agents import DQNAgent +from rl.policy import BoltzmannQPolicy +from rl.memory import SequentialMemory + + +def build_agent(model, actions): + policy = BoltzmannQPolicy() + memory = SequentialMemory(limit=50000, window_length=1) + dqn = DQNAgent(model=model, memory=memory, policy=policy, + nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2) + return dqn + + +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=1e-3), metrics=['mae']) +dqn.fit(env, nb_steps=50000, visualize=False, verbose=1) + +scores = dqn.test(env, nb_episodes=100, visualize=False) +print(np.mean(scores.history['episode_reward'])) diff --git a/RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb b/RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb deleted file mode 100644 index 031259c..0000000 --- a/RL_test_1/OpenAI Custom Environment Reinforcement Learning.ipynb +++ /dev/null @@ -1,701 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0. Install Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Test Random Environment with OpenAI Gym" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from gym import Env\n", - "from gym.spaces import Discrete, Box\n", - "import numpy as np\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "class ShowerEnv(Env):\n", - " def __init__(self):\n", - " # Actions we can take, down, stay, up\n", - " self.action_space = Discrete(5)\n", - " # Temperature array\n", - " self.observation_space = Discrete(100)\n", - " # Set start temp\n", - " self.state = 38 + random.randint(-3,3)\n", - " # Set shower length\n", - " self.shower_length = 60\n", - " \n", - " self.answer = list(range(3))\n", - " random.shuffle(self.answer)\n", - " \n", - " def step(self, action):\n", - " # Apply action\n", - " # 0 -1 = -1 temperature\n", - " # 1 -1 = 0 \n", - " # 2 -1 = 1 temperature \n", - " self.state += action -2 \n", - " # Reduce shower length by 1 second\n", - " self.shower_length -= 1 \n", - " \n", - " # Calculate reward\n", - " if self.state >=37 and self.state <=39: \n", - " reward =1 \n", - " else: \n", - " reward = -1 \n", - " \n", - " # Check if shower is done\n", - " if self.shower_length <= 0: \n", - " done = True\n", - " else:\n", - " done = False\n", - " \n", - " # Apply temperature noise\n", - " #self.state += random.randint(-1,1)\n", - " # Set placeholder for info\n", - " info = {}\n", - " \n", - " # Return step information\n", - " return self.state, reward, done, info\n", - "\n", - " def render(self):\n", - " # Implement viz\n", - " pass\n", - " \n", - " def reset(self):\n", - " # Reset shower temperature\n", - " self.state = 38 + random.randint(-3,3)\n", - " # Reset shower time\n", - " self.shower_length = 
60\n", - " random.shuffle(self.answer)\n", - " return self.state" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "env = ShowerEnv()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "76" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "env.observation_space.sample()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "assert not env.observation_space.contains(1.5)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Episode:1 Score:10\n", - "Episode:2 Score:-56\n", - "Episode:3 Score:-34\n", - "Episode:4 Score:-58\n", - "Episode:5 Score:-60\n", - "Episode:6 Score:-50\n", - "Episode:7 Score:-6\n", - "Episode:8 Score:-60\n", - "Episode:9 Score:4\n", - "Episode:10 Score:44\n" - ] - } - ], - "source": [ - "episodes = 10\n", - "for episode in range(1, episodes+1):\n", - " state = env.reset()\n", - " done = False\n", - " score = 0 \n", - " \n", - " while not done:\n", - " #env.render()\n", - " action = env.action_space.sample()\n", - " n_state, reward, done, info = env.step(action)\n", - " score+=reward\n", - " print('Episode:{} Score:{}'.format(episode, score))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Create a Deep Learning Model with Keras" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from tensorflow.keras.models import Sequential\n", - "from tensorflow.keras.layers import Dense, Flatten\n", - "from tensorflow.keras.optimizers import Adam" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "#states = env.observation_space.shape\n", - "states = (1,)\n", - "actions = env.action_space.n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "actions" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1,)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "states" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(states, actions):\n", - " model = Sequential() \n", - " model.add(Dense(24, activation='relu', input_shape=states))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(actions, activation='linear'))\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ - "del model" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "model = build_model(states, actions)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential_2\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - 
"=================================================================\n", - "dense_6 (Dense) (None, 24) 48 \n", - "_________________________________________________________________\n", - "dense_7 (Dense) (None, 24) 600 \n", - "_________________________________________________________________\n", - "dense_8 (Dense) (None, 3) 75 \n", - "=================================================================\n", - "Total params: 723\n", - "Trainable params: 723\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Build Agent with Keras-RL" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "from rl.agents import DQNAgent\n", - "from rl.policy import BoltzmannQPolicy\n", - "from rl.memory import SequentialMemory" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "def build_agent(model, actions):\n", - " policy = BoltzmannQPolicy()\n", - " memory = SequentialMemory(limit=50000, window_length=1)\n", - " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", - " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", - " return dqn" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training for 50000 steps ...\n", - "Interval 1 (0 steps performed)\n", - "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - " 1/10000 [..............................] - ETA: 11:37 - reward: 1.0000" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", - " warnings.warn('Not enough entries to sample without replacement. 
Consider increasing your warm-up phase to avoid oversampling!')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10000/10000 [==============================] - 71s 7ms/step - reward: -0.4972\n", - "166 episodes - episode_reward: -30.193 [-60.000, 60.000] - loss: 0.802 - mae: 7.679 - mean_q: -9.525\n", - "\n", - "Interval 2 (10000 steps performed)\n", - "10000/10000 [==============================] - 72s 7ms/step - reward: -0.0406\n", - "167 episodes - episode_reward: -2.072 [-60.000, 60.000] - loss: 1.381 - mae: 8.380 - mean_q: -5.159\n", - "\n", - "Interval 3 (20000 steps performed)\n", - "10000/10000 [==============================] - 74s 7ms/step - reward: 0.1818\n", - "167 episodes - episode_reward: 10.766 [-60.000, 60.000] - loss: 2.197 - mae: 8.740 - mean_q: 1.123\n", - "\n", - "Interval 4 (30000 steps performed)\n", - "10000/10000 [==============================] - 79s 8ms/step - reward: 0.3792\n", - "166 episodes - episode_reward: 22.602 [-60.000, 60.000] - loss: 6.342 - mae: 10.959 - mean_q: 12.510\n", - "\n", - "Interval 5 (40000 steps performed)\n", - "10000/10000 [==============================] - 83s 8ms/step - reward: 0.3358\n", - "done, took 378.766 seconds\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", - "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 100 episodes ...\n", - "Episode 1: reward: -60.000, steps: 60\n", - "Episode 2: reward: 60.000, steps: 60\n", - "Episode 3: reward: 60.000, steps: 60\n", - "Episode 4: reward: -60.000, steps: 60\n", - "Episode 5: reward: 60.000, steps: 60\n", - "Episode 6: reward: -60.000, steps: 60\n", - "Episode 7: reward: -60.000, steps: 60\n", - "Episode 8: reward: -60.000, steps: 60\n", - "Episode 9: reward: -60.000, steps: 60\n", - "Episode 10: reward: -60.000, steps: 60\n", - "Episode 11: reward: 60.000, steps: 60\n", - "Episode 12: reward: 60.000, steps: 60\n", - "Episode 13: reward: 60.000, steps: 60\n", - "Episode 14: reward: -60.000, steps: 60\n", - "Episode 15: reward: 60.000, steps: 60\n", - "Episode 16: reward: 60.000, steps: 60\n", - "Episode 17: reward: 60.000, steps: 60\n", - "Episode 18: reward: -60.000, steps: 60\n", - "Episode 19: reward: -60.000, steps: 60\n", - "Episode 20: reward: 60.000, steps: 60\n", - "Episode 21: reward: -60.000, steps: 60\n", - "Episode 22: reward: -60.000, steps: 60\n", - "Episode 23: reward: -60.000, steps: 60\n", - "Episode 24: reward: 60.000, steps: 60\n", - "Episode 25: reward: -60.000, steps: 60\n", - "Episode 26: reward: -60.000, steps: 60\n", - "Episode 27: reward: 60.000, steps: 60\n", - "Episode 28: reward: 60.000, steps: 60\n", - "Episode 29: reward: 60.000, steps: 60\n", - "Episode 30: reward: -60.000, steps: 60\n", - "Episode 31: reward: -60.000, steps: 60\n", - "Episode 32: reward: 60.000, steps: 60\n", - "Episode 33: reward: 60.000, steps: 60\n", - "Episode 34: reward: 60.000, steps: 60\n", - "Episode 35: reward: 60.000, steps: 60\n", - "Episode 36: reward: 60.000, steps: 60\n", - "Episode 37: reward: -60.000, steps: 60\n", - "Episode 38: reward: -60.000, steps: 60\n", - "Episode 39: reward: -60.000, steps: 60\n", - "Episode 40: reward: 60.000, steps: 
60\n", - "Episode 41: reward: -60.000, steps: 60\n", - "Episode 42: reward: 60.000, steps: 60\n", - "Episode 43: reward: -60.000, steps: 60\n", - "Episode 44: reward: -60.000, steps: 60\n", - "Episode 45: reward: -60.000, steps: 60\n", - "Episode 46: reward: 60.000, steps: 60\n", - "Episode 47: reward: 60.000, steps: 60\n", - "Episode 48: reward: -60.000, steps: 60\n", - "Episode 49: reward: 60.000, steps: 60\n", - "Episode 50: reward: 60.000, steps: 60\n", - "Episode 51: reward: -60.000, steps: 60\n", - "Episode 52: reward: 60.000, steps: 60\n", - "Episode 53: reward: -60.000, steps: 60\n", - "Episode 54: reward: 60.000, steps: 60\n", - "Episode 55: reward: 60.000, steps: 60\n", - "Episode 56: reward: -60.000, steps: 60\n", - "Episode 57: reward: 60.000, steps: 60\n", - "Episode 58: reward: -60.000, steps: 60\n", - "Episode 59: reward: -60.000, steps: 60\n", - "Episode 60: reward: 60.000, steps: 60\n", - "Episode 61: reward: -60.000, steps: 60\n", - "Episode 62: reward: -60.000, steps: 60\n", - "Episode 63: reward: 60.000, steps: 60\n", - "Episode 64: reward: -60.000, steps: 60\n", - "Episode 65: reward: 60.000, steps: 60\n", - "Episode 66: reward: -60.000, steps: 60\n", - "Episode 67: reward: 60.000, steps: 60\n", - "Episode 68: reward: -60.000, steps: 60\n", - "Episode 69: reward: -60.000, steps: 60\n", - "Episode 70: reward: -60.000, steps: 60\n", - "Episode 71: reward: -60.000, steps: 60\n", - "Episode 72: reward: -60.000, steps: 60\n", - "Episode 73: reward: 60.000, steps: 60\n", - "Episode 74: reward: -60.000, steps: 60\n", - "Episode 75: reward: 60.000, steps: 60\n", - "Episode 76: reward: 60.000, steps: 60\n", - "Episode 77: reward: -60.000, steps: 60\n", - "Episode 78: reward: -60.000, steps: 60\n", - "Episode 79: reward: 60.000, steps: 60\n", - "Episode 80: reward: -60.000, steps: 60\n", - "Episode 81: reward: 60.000, steps: 60\n", - "Episode 82: reward: 60.000, steps: 60\n", - "Episode 83: reward: -60.000, steps: 60\n", - "Episode 84: reward: 60.000, steps: 60\n", - "Episode 85: reward: -60.000, steps: 60\n", - "Episode 86: reward: 60.000, steps: 60\n", - "Episode 87: reward: -60.000, steps: 60\n", - "Episode 88: reward: 60.000, steps: 60\n", - "Episode 89: reward: 60.000, steps: 60\n", - "Episode 90: reward: -60.000, steps: 60\n", - "Episode 91: reward: -60.000, steps: 60\n", - "Episode 92: reward: 60.000, steps: 60\n", - "Episode 93: reward: -60.000, steps: 60\n", - "Episode 94: reward: -60.000, steps: 60\n", - "Episode 95: reward: -60.000, steps: 60\n", - "Episode 96: reward: 60.000, steps: 60\n", - "Episode 97: reward: 60.000, steps: 60\n", - "Episode 98: reward: -60.000, steps: 60\n", - "Episode 99: reward: -60.000, steps: 60\n", - "Episode 100: reward: 60.000, steps: 60\n", - "-3.6\n" - ] - } - ], - "source": [ - "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", - "print(np.mean(scores.history['episode_reward']))" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.save_weights('dqn_weights_discrete.h5f', overwrite=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "del model\n", - "del dqn\n", - "del env" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [], - "source": [ - "env = ShowerEnv()\n", - "actions = env.action_space.n\n", - "states = (1,)\n", - "model = build_model(states, actions)\n", - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" - ] 
- }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.load_weights('dqn_weights_discrete.h5f')" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 100 episodes ...\n", - "Episode 1: reward: 58.000, steps: 60\n", - "Episode 2: reward: 58.000, steps: 60\n", - "Episode 3: reward: 58.000, steps: 60\n", - "Episode 4: reward: 60.000, steps: 60\n", - "Episode 5: reward: 60.000, steps: 60\n", - "Episode 6: reward: 58.000, steps: 60\n", - "Episode 7: reward: 58.000, steps: 60\n", - "Episode 8: reward: 60.000, steps: 60\n", - "Episode 9: reward: 60.000, steps: 60\n", - "Episode 10: reward: 58.000, steps: 60\n", - "Episode 11: reward: 58.000, steps: 60\n", - "Episode 12: reward: 60.000, steps: 60\n", - "Episode 13: reward: 60.000, steps: 60\n", - "Episode 14: reward: 60.000, steps: 60\n", - "Episode 15: reward: 58.000, steps: 60\n", - "Episode 16: reward: 60.000, steps: 60\n", - "Episode 17: reward: 60.000, steps: 60\n", - "Episode 18: reward: 60.000, steps: 60\n", - "Episode 19: reward: 60.000, steps: 60\n", - "Episode 20: reward: 60.000, steps: 60\n", - "Episode 21: reward: 60.000, steps: 60\n", - "Episode 22: reward: 60.000, steps: 60\n", - "Episode 23: reward: 60.000, steps: 60\n", - "Episode 24: reward: 60.000, steps: 60\n", - "Episode 25: reward: 60.000, steps: 60\n", - "Episode 26: reward: 60.000, steps: 60\n", - "Episode 27: reward: 60.000, steps: 60\n", - "Episode 28: reward: 60.000, steps: 60\n", - "Episode 29: reward: 60.000, steps: 60\n", - "Episode 30: reward: 58.000, steps: 60\n", - "Episode 31: reward: 60.000, steps: 60\n", - "Episode 32: reward: 58.000, steps: 60\n", - "Episode 33: reward: 60.000, steps: 60\n", - "Episode 34: reward: 60.000, steps: 60\n", - "Episode 35: reward: 58.000, steps: 60\n", - "Episode 36: reward: 60.000, steps: 60\n", - "Episode 37: reward: 60.000, steps: 60\n", - "Episode 38: reward: 60.000, steps: 60\n", - "Episode 39: reward: 60.000, steps: 60\n", - "Episode 40: reward: 60.000, steps: 60\n", - "Episode 41: reward: 60.000, steps: 60\n", - "Episode 42: reward: 60.000, steps: 60\n", - "Episode 43: reward: 60.000, steps: 60\n", - "Episode 44: reward: 60.000, steps: 60\n", - "Episode 45: reward: 58.000, steps: 60\n", - "Episode 46: reward: 58.000, steps: 60\n", - "Episode 47: reward: 60.000, steps: 60\n", - "Episode 48: reward: 60.000, steps: 60\n", - "Episode 49: reward: 60.000, steps: 60\n", - "Episode 50: reward: 60.000, steps: 60\n", - "Episode 51: reward: 60.000, steps: 60\n", - "Episode 52: reward: 60.000, steps: 60\n", - "Episode 53: reward: 60.000, steps: 60\n", - "Episode 54: reward: 60.000, steps: 60\n", - "Episode 55: reward: 60.000, steps: 60\n", - "Episode 56: reward: 60.000, steps: 60\n", - "Episode 57: reward: 60.000, steps: 60\n", - "Episode 58: reward: 60.000, steps: 60\n", - "Episode 59: reward: 58.000, steps: 60\n", - "Episode 60: reward: 58.000, steps: 60\n", - "Episode 61: reward: 60.000, steps: 60\n", - "Episode 62: reward: 60.000, steps: 60\n", - "Episode 63: reward: 60.000, steps: 60\n", - "Episode 64: reward: 58.000, steps: 60\n", - "Episode 65: reward: 60.000, steps: 60\n", - "Episode 66: reward: 58.000, steps: 60\n", - "Episode 67: reward: 60.000, steps: 60\n", - "Episode 68: reward: 58.000, steps: 60\n", - "Episode 69: reward: 58.000, steps: 60\n", - "Episode 70: reward: 60.000, steps: 60\n", - "Episode 71: reward: 60.000, steps: 60\n", - "Episode 72: 
reward: 60.000, steps: 60\n", - "Episode 73: reward: 60.000, steps: 60\n", - "Episode 74: reward: 58.000, steps: 60\n", - "Episode 75: reward: 58.000, steps: 60\n", - "Episode 76: reward: 58.000, steps: 60\n", - "Episode 77: reward: 60.000, steps: 60\n", - "Episode 78: reward: 60.000, steps: 60\n", - "Episode 79: reward: 60.000, steps: 60\n", - "Episode 80: reward: 58.000, steps: 60\n", - "Episode 81: reward: 58.000, steps: 60\n", - "Episode 82: reward: 60.000, steps: 60\n", - "Episode 83: reward: 58.000, steps: 60\n", - "Episode 84: reward: 60.000, steps: 60\n", - "Episode 85: reward: 58.000, steps: 60\n", - "Episode 86: reward: 60.000, steps: 60\n", - "Episode 87: reward: 60.000, steps: 60\n", - "Episode 88: reward: 60.000, steps: 60\n", - "Episode 89: reward: 60.000, steps: 60\n", - "Episode 90: reward: 60.000, steps: 60\n", - "Episode 91: reward: 58.000, steps: 60\n", - "Episode 92: reward: 60.000, steps: 60\n", - "Episode 93: reward: 60.000, steps: 60\n", - "Episode 94: reward: 60.000, steps: 60\n", - "Episode 95: reward: 60.000, steps: 60\n", - "Episode 96: reward: 60.000, steps: 60\n", - "Episode 97: reward: 58.000, steps: 60\n", - "Episode 98: reward: 58.000, steps: 60\n", - "Episode 99: reward: 60.000, steps: 60\n", - "Episode 100: reward: 60.000, steps: 60\n", - "59.42\n" - ] - } - ], - "source": [ - "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", - "print(np.mean(scores.history['episode_reward']))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py b/RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py new file mode 100644 index 0000000..d01eb0b --- /dev/null +++ b/RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py @@ -0,0 +1,169 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.14.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# # 0. Install Dependencies + +# # 1. 
Test Random Environment with OpenAI Gym + +from gym import Env +from gym.spaces import Discrete, Box +import numpy as np +import random + + +class ShowerEnv(Env): + def __init__(self): + # Actions we can take, down, stay, up + self.action_space = Discrete(5) + # Temperature array + self.observation_space = Discrete(100) + # Set start temp + self.state = 38 + random.randint(-3,3) + # Set shower length + self.shower_length = 60 + + self.answer = list(range(3)) + random.shuffle(self.answer) + + def step(self, action): + # Apply action + # 0 -1 = -1 temperature + # 1 -1 = 0 + # 2 -1 = 1 temperature + self.state += action -2 + # Reduce shower length by 1 second + self.shower_length -= 1 + + # Calculate reward + if self.state >=37 and self.state <=39: + reward =1 + else: + reward = -1 + + # Check if shower is done + if self.shower_length <= 0: + done = True + else: + done = False + + # Apply temperature noise + #self.state += random.randint(-1,1) + # Set placeholder for info + info = {} + + # Return step information + return self.state, reward, done, info + + def render(self): + # Implement viz + pass + + def reset(self): + # Reset shower temperature + self.state = 38 + random.randint(-3,3) + # Reset shower time + self.shower_length = 60 + random.shuffle(self.answer) + return self.state + + +env = ShowerEnv() + +env.observation_space.sample() + +assert not env.observation_space.contains(1.5) + +episodes = 10 +for episode in range(1, episodes+1): + state = env.reset() + done = False + score = 0 + + while not done: + #env.render() + action = env.action_space.sample() + n_state, reward, done, info = env.step(action) + score+=reward + print('Episode:{} Score:{}'.format(episode, score)) + +# # 2. Create a Deep Learning Model with Keras + +import numpy as np +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Flatten +from tensorflow.keras.optimizers import Adam + +#states = env.observation_space.shape +states = (1,) +actions = env.action_space.n + +actions + +states + + +def build_model(states, actions): + model = Sequential() + model.add(Dense(24, activation='relu', input_shape=states)) + model.add(Dense(24, activation='relu')) + model.add(Dense(actions, activation='linear')) + return model + + +del model + +model = build_model(states, actions) + +model.summary() + +# # 3. 
Build Agent with Keras-RL + +from rl.agents import DQNAgent +from rl.policy import BoltzmannQPolicy +from rl.memory import SequentialMemory + + +def build_agent(model, actions): + policy = BoltzmannQPolicy() + memory = SequentialMemory(limit=50000, window_length=1) + dqn = DQNAgent(model=model, memory=memory, policy=policy, + nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2) + return dqn + + +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=1e-3), metrics=['mae']) +dqn.fit(env, nb_steps=50000, visualize=False, verbose=1) + +scores = dqn.test(env, nb_episodes=100, visualize=False) +print(np.mean(scores.history['episode_reward'])) + +dqn.save_weights('dqn_weights_discrete.h5f', overwrite=True) + +del model +del dqn +del env + +env = ShowerEnv() +actions = env.action_space.n +states = (1,) +model = build_model(states, actions) +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=1e-3), metrics=['mae']) + +dqn.load_weights('dqn_weights_discrete.h5f') + +scores = dqn.test(env, nb_episodes=100, visualize=False) +print(np.mean(scores.history['episode_reward'])) diff --git a/RL_test_2/Deep Reinforcement Learning.ipynb b/RL_test_2/Deep Reinforcement Learning.ipynb deleted file mode 100644 index 66b93de..0000000 --- a/RL_test_2/Deep Reinforcement Learning.ipynb +++ /dev/null @@ -1,451 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0. Install Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: tensorflow==2.3.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (2.3.0)\n", - "Requirement already satisfied: keras-preprocessing<1.2,>=1.1.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.1.2)\n", - "Requirement already satisfied: google-pasta>=0.1.8 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (0.2.0)\n", - "Requirement already satisfied: opt-einsum>=2.3.2 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (3.3.0)\n", - "Requirement already satisfied: termcolor>=1.1.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.1.0)\n", - "Requirement already satisfied: wrapt>=1.11.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.13.3)\n", - "Requirement already satisfied: grpcio>=1.8.6 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.43.0)\n", - "Requirement already satisfied: numpy<1.19.0,>=1.16.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.18.5)\n", - "Requirement already satisfied: scipy==1.4.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.4.1)\n", - "Requirement already satisfied: six>=1.12.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.16.0)\n", - "Requirement already satisfied: astunparse==1.6.3 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (1.6.3)\n", - "Requirement already satisfied: gast==0.3.3 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorflow==2.3.0) (0.3.3)\n", - "Requirement already satisfied: wheel>=0.26 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages 
/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2.6.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (0.2.8)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (5.0.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (4.8)\n", - "Requirement already satisfied: requests-oauthlib>=0.7.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (1.3.1)\n", - "Requirement already satisfied: importlib-metadata>=4.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (4.10.1)\n", - "Requirement already satisfied: zipp>=0.5 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (3.7.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (4.0.1)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (0.4.8)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (1.26.8)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2.0.11)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (2021.10.8)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (3.3)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<3,>=2.3.0->tensorflow->keras-rl2) (3.2.0)\n" - ] - } - ], - "source": [ - "!pip install tensorflow==2.3.0\n", - "!pip install gym\n", - "!pip install keras\n", - "!pip install keras-rl2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. 
Test Random Environment with OpenAI Gym" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import gym \n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make('CartPole-v0')\n", - "states = env.observation_space.shape[0]\n", - "actions = env.action_space.n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "actions" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Episode:1 Score:19.0\n", - "Episode:2 Score:36.0\n", - "Episode:3 Score:15.0\n", - "Episode:4 Score:23.0\n", - "Episode:5 Score:15.0\n", - "Episode:6 Score:12.0\n", - "Episode:7 Score:40.0\n", - "Episode:8 Score:23.0\n", - "Episode:9 Score:18.0\n", - "Episode:10 Score:12.0\n" - ] - } - ], - "source": [ - "episodes = 10\n", - "for episode in range(1, episodes+1):\n", - " state = env.reset()\n", - " done = False\n", - " score = 0 \n", - " \n", - " while not done:\n", - " env.render()\n", - " action = random.choice([0,1])\n", - " n_state, reward, done, info = env.step(action)\n", - " score+=reward\n", - " print('Episode:{} Score:{}'.format(episode, score))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Create a Deep Learning Model with Keras" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from tensorflow.keras.models import Sequential\n", - "from tensorflow.keras.layers import Dense, Flatten\n", - "from tensorflow.keras.optimizers import Adam" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(states, actions):\n", - " model = Sequential()\n", - " model.add(Flatten(input_shape=(1,states)))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(actions, activation='linear'))\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "del model\n", - "\n", - "model = build_model(states, actions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. 
Build Agent with Keras-RL" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from rl.agents import DQNAgent\n", - "from rl.policy import BoltzmannQPolicy\n", - "from rl.memory import SequentialMemory" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def build_agent(model, actions):\n", - " policy = BoltzmannQPolicy()\n", - " memory = SequentialMemory(limit=50000, window_length=1)\n", - " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", - " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", - " return dqn" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training for 10000 steps ...\n", - "Interval 1 (0 steps performed)\n", - "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - " 1/10000 [..............................] - ETA: 11:14 - reward: 1.0000" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", - " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10000/10000 [==============================] - 76s 8ms/step - reward: 1.0000\n", - "done, took 76.468 seconds\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", - "dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scores = dqn.test(env, nb_episodes=5, visualize=False)\n", - "print(np.mean(scores.history['episode_reward']))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_ = dqn.test(env, nb_episodes=15, visualize=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. 
Reloading Agent from Memory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.save_weights('dqn_weights.h5f', overwrite=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "del model\n", - "del dqn\n", - "del env" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make('CartPole-v0')\n", - "actions = env.action_space.n\n", - "states = env.observation_space.shape[0]\n", - "model = build_model(states, actions)\n", - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.load_weights('dqn_weights.h5f')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_ = dqn.test(env, nb_episodes=5, visualize=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/RL_test_2/DeepReinforcementLearning.py b/RL_test_2/DeepReinforcementLearning.py new file mode 100644 index 0000000..358559f --- /dev/null +++ b/RL_test_2/DeepReinforcementLearning.py @@ -0,0 +1,115 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:light +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.14.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# # 0. Install Dependencies + +# !pip install tensorflow==2.3.0 +# !pip install gym +# !pip install keras +# !pip install keras-rl2 + +# # 1. Test Random Environment with OpenAI Gym + +import gym +import random + +env = gym.make('CartPole-v0') +states = env.observation_space.shape[0] +actions = env.action_space.n + +actions + +episodes = 10 +for episode in range(1, episodes+1): + state = env.reset() + done = False + score = 0 + + while not done: + env.render() + action = random.choice([0,1]) + n_state, reward, done, info = env.step(action) + score+=reward + print('Episode:{} Score:{}'.format(episode, score)) + +# # 2. Create a Deep Learning Model with Keras + +import numpy as np +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Flatten +from tensorflow.keras.optimizers import Adam + + +def build_model(states, actions): + model = Sequential() + model.add(Flatten(input_shape=(1,states))) + model.add(Dense(24, activation='relu')) + model.add(Dense(24, activation='relu')) + model.add(Dense(actions, activation='linear')) + return model + + +# + +del model + +model = build_model(states, actions) +# - + +model.summary() + +# # 3. 
Build Agent with Keras-RL + +from rl.agents import DQNAgent +from rl.policy import BoltzmannQPolicy +from rl.memory import SequentialMemory + + +def build_agent(model, actions): + policy = BoltzmannQPolicy() + memory = SequentialMemory(limit=50000, window_length=1) + dqn = DQNAgent(model=model, memory=memory, policy=policy, + nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2) + return dqn + + +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=1e-3), metrics=['mae']) +dqn.fit(env, nb_steps=10000, visualize=False, verbose=1) + +scores = dqn.test(env, nb_episodes=5, visualize=False) +print(np.mean(scores.history['episode_reward'])) + +_ = dqn.test(env, nb_episodes=15, visualize=True) + +# # 4. Reloading Agent from Memory + +dqn.save_weights('dqn_weights.h5f', overwrite=True) + +del model +del dqn +del env + +env = gym.make('CartPole-v0') +actions = env.action_space.n +states = env.observation_space.shape[0] +model = build_model(states, actions) +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=1e-3), metrics=['mae']) + +dqn.load_weights('dqn_weights.h5f') + +_ = dqn.test(env, nb_episodes=5, visualize=True) + + diff --git a/RL_test_3/Untitled.ipynb b/RL_test_3/Untitled.ipynb deleted file mode 100644 index 363fcab..0000000 --- a/RL_test_3/Untitled.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/RL_test_3/Untitled.py b/RL_test_3/Untitled.py new file mode 100644 index 0000000..9cc76c1 --- /dev/null +++ b/RL_test_3/Untitled.py @@ -0,0 +1,10 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.14.1 +# --- diff --git a/RL_test_4/hanabi_ml_2.ipynb b/RL_test_4/hanabi_ml_2.ipynb deleted file mode 100644 index a83d061..0000000 --- a/RL_test_4/hanabi_ml_2.ipynb +++ /dev/null @@ -1,228 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from DumbGame import DumbGameEnv\n", - "import numpy as np\n", - "from tensorflow.keras.models import Sequential\n", - "from tensorflow.keras.layers import Dense, Flatten\n", - "from tensorflow.keras.optimizers import Adam" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "States:(1,) Actions:5\n" - ] - } - ], - "source": [ - "env = DumbGameEnv()\n", - "states = env.observation_space.shape\n", - "actions = env.action_space.n\n", - "print(f\"States:{states} Actions:{actions}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(states, actions):\n", - " model = Sequential() \n", - " model.add(Dense(24, activation='relu', input_shape=states))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(actions, activation='linear'))\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from rl.agents import DQNAgent\n", - "from rl.policy import BoltzmannQPolicy\n", - "from rl.memory import SequentialMemory\n", - "\n", - "def build_agent(model, actions):\n", - " policy = BoltzmannQPolicy()\n", - " memory = SequentialMemory(limit=20000, window_length=1)\n", - " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", - " nb_actions=actions, nb_steps_warmup=20, target_model_update=0.1)\n", - " return dqn" - ] 
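For reference, the BoltzmannQPolicy used in build_agent above selects actions by sampling from a softmax over the network's Q-values instead of always taking the arg-max, so exploration comes from the gaps between Q-values themselves. A minimal NumPy sketch of that idea follows; it is an illustration only, not keras-rl's exact implementation, and the temperature tau=1.0 is an assumed default:

import numpy as np

def boltzmann_action(q_values, tau=1.0):
    # Softmax over Q-values: higher-valued actions are sampled more often,
    # but every action keeps a nonzero probability, which gives exploration.
    q = np.asarray(q_values, dtype=np.float64)
    exp_q = np.exp((q - q.max()) / tau)   # subtract the max for numerical stability
    probs = exp_q / exp_q.sum()
    return np.random.choice(len(probs), p=probs)

# With the 5 actions of DumbGameEnv, e.g. boltzmann_action([0.1, -0.3, 0.8, 0.0, -1.2])
# picks action 2 most of the time but still tries the other actions occasionally.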
- }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "dense (Dense) (None, 24) 48 \n", - "_________________________________________________________________\n", - "dense_1 (Dense) (None, 24) 600 \n", - "_________________________________________________________________\n", - "dense_2 (Dense) (None, 5) 125 \n", - "=================================================================\n", - "Total params: 773\n", - "Trainable params: 773\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model = build_model(states, actions)\n", - "model.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training for 5000 steps ...\n", - "Interval 1 (0 steps performed)\n", - "WARNING:tensorflow:From /Users/Ben/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - " 1/10000 [..............................] - ETA: 8:06 - reward: -1.0000" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/anaconda3/lib/python3.6/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", - " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 4995/10000 [=============>................] 
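The parameter counts in the model summary above follow directly from the layer sizes, given the 1-dimensional observation of DumbGameEnv (each Dense layer stores a kernel plus a bias); a quick check:

params = 1 * 24 + 24      # dense:   (None, 24) ->  48 parameters
params += 24 * 24 + 24    # dense_1: (None, 24) -> 600 parameters
params += 24 * 5 + 5      # dense_2: (None, 5)  -> 125 parameters
assert params == 773      # matches "Total params: 773" in the summary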
- ETA: 29s - reward: -0.6040done, took 29.182 seconds\n" - ] - } - ], - "source": [ - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=0.1))#, metrics=['mae'])\n", - "history = dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'nb_steps': 5000}\n", - "{'episode_reward': [-16.0, -10.0, -8.0, -24.0, -26.0, -2.0, -8.0, -21.0, -1.0, -2.0, -16.0, 0.0, -31.0, -38.0, -23.0, -38.0, -21.0, -22.0, -4.0, -9.0, -19.0, -19.0, -20.0, -21.0, -8.0, -29.0, -26.0, -18.0, -22.0, -10.0, -17.0, -17.0, -19.0, -50.0, -31.0, -16.0, -4.0, -13.0, -23.0, -29.0, -18.0, -22.0, -21.0, -9.0, -9.0, 0.0, -20.0, -28.0, -20.0, -26.0, -13.0, -25.0, -3.0, -11.0, -9.0, -7.0, -18.0, -11.0, -20.0, -15.0, -14.0, -15.0, -5.0, 0.0, -27.0, -46.0, -13.0, -7.0, -16.0, -16.0, -16.0, -12.0, -9.0, -16.0, -3.0, -13.0, -15.0, -5.0, -44.0, -11.0, 1.0, -26.0, -21.0, -18.0, -27.0, -33.0, -24.0, -12.0, -13.0, -6.0, -22.0, -1.0, -11.0, -2.0, -24.0, -1.0, -14.0, -7.0, -8.0, -27.0, -14.0, -18.0, -31.0, -20.0, -9.0, -3.0, 0.0, -27.0, -7.0, -16.0, -16.0, -10.0, -31.0, -14.0, -7.0, -16.0, -3.0, -24.0, -16.0, -39.0, -44.0, -20.0, -8.0, -2.0, -25.0, -8.0, -24.0, -36.0, -6.0, -20.0, -11.0, -21.0, -27.0, -33.0, -8.0, -12.0, -28.0, -8.0, -35.0, -4.0, -8.0, -27.0, 1.0, -8.0, -9.0, -12.0, -20.0, -14.0, 1.0, -23.0, -15.0, -5.0, -2.0, -14.0, -29.0, -18.0, -24.0, -4.0, -7.0, -26.0, -11.0, -18.0, -15.0, -10.0, -25.0, -6.0, -5.0, -13.0, -8.0, -13.0, -2.0, -7.0, -3.0, -9.0, -20.0, 1.0, -10.0, -4.0, -23.0, -16.0, -24.0, -9.0, -9.0, -6.0, -8.0, -10.0, -26.0, 1.0, -24.0, -14.0, -6.0, -22.0, -3.0, -1.0, -2.0, -14.0, -8.0, -15.0], 'nb_episode_steps': [26, 20, 18, 34, 36, 12, 18, 31, 11, 12, 26, 10, 41, 48, 33, 48, 31, 32, 14, 19, 29, 29, 30, 31, 18, 39, 36, 28, 32, 20, 27, 27, 29, 60, 41, 26, 14, 23, 33, 39, 28, 32, 31, 19, 19, 10, 30, 38, 30, 36, 23, 35, 13, 21, 19, 17, 28, 21, 30, 25, 24, 25, 15, 10, 37, 56, 23, 17, 26, 26, 26, 22, 19, 26, 13, 23, 25, 15, 54, 21, 9, 36, 31, 28, 37, 43, 34, 22, 23, 16, 32, 11, 21, 12, 34, 11, 24, 17, 18, 37, 24, 28, 41, 30, 19, 13, 10, 37, 17, 26, 26, 20, 41, 24, 17, 26, 13, 34, 26, 49, 54, 30, 18, 12, 35, 18, 34, 46, 16, 30, 21, 31, 37, 43, 18, 22, 38, 18, 45, 14, 18, 37, 9, 18, 19, 22, 30, 24, 9, 33, 25, 15, 12, 24, 39, 28, 34, 14, 17, 36, 21, 28, 25, 20, 35, 16, 15, 23, 18, 23, 12, 17, 13, 19, 30, 9, 20, 14, 33, 26, 34, 19, 19, 16, 18, 20, 36, 9, 34, 24, 16, 32, 13, 11, 12, 24, 18, 25], 'nb_steps': [26, 46, 64, 98, 134, 146, 164, 195, 206, 218, 244, 254, 295, 343, 376, 424, 455, 487, 501, 520, 549, 578, 608, 639, 657, 696, 732, 760, 792, 812, 839, 866, 895, 955, 996, 1022, 1036, 1059, 1092, 1131, 1159, 1191, 1222, 1241, 1260, 1270, 1300, 1338, 1368, 1404, 1427, 1462, 1475, 1496, 1515, 1532, 1560, 1581, 1611, 1636, 1660, 1685, 1700, 1710, 1747, 1803, 1826, 1843, 1869, 1895, 1921, 1943, 1962, 1988, 2001, 2024, 2049, 2064, 2118, 2139, 2148, 2184, 2215, 2243, 2280, 2323, 2357, 2379, 2402, 2418, 2450, 2461, 2482, 2494, 2528, 2539, 2563, 2580, 2598, 2635, 2659, 2687, 2728, 2758, 2777, 2790, 2800, 2837, 2854, 2880, 2906, 2926, 2967, 2991, 3008, 3034, 3047, 3081, 3107, 3156, 3210, 3240, 3258, 3270, 3305, 3323, 3357, 3403, 3419, 3449, 3470, 3501, 3538, 3581, 3599, 3621, 3659, 3677, 3722, 3736, 3754, 3791, 3800, 3818, 3837, 3859, 3889, 3913, 3922, 3955, 3980, 3995, 4007, 4031, 4070, 4098, 4132, 4146, 4163, 4199, 4220, 4248, 4273, 4293, 4328, 4344, 4359, 4382, 
4400, 4423, 4435, 4452, 4465, 4484, 4514, 4523, 4543, 4557, 4590, 4616, 4650, 4669, 4688, 4704, 4722, 4742, 4778, 4787, 4821, 4845, 4861, 4893, 4906, 4917, 4929, 4953, 4971, 4996]}\n" - ] - } - ], - "source": [ - "print(history.params)\n", - "print(history.history)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 1 episodes ...\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mepisode_step\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mnb_max_episode_steps\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0mdone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterminal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0mepisode_reward\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, reward, terminal)\u001b[0m\n\u001b[1;32m 240\u001b[0m training=self.training)\n\u001b[1;32m 241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 242\u001b[0;31m \u001b[0mmetrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 243\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;31m# We're done here. 
No need to update the experience memory since we only use the working\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mmetrics_names\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mdummy_output_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrainable_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 342\u001b[0;31m 
\u001b[0mmodel_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdummy_output_name\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel_metrics\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_metrics\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics_names\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "scores = dqn.test(env, nb_episodes=1, visualize=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#print(np.mean(scores.history['episode_reward']))\n", - "#dqn.get_config()\n", - "#scores = dqn.test(env, nb_episodes=1, visualize=False, verbose=1)\n", - "#test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1)\n", - "#print(np.mean(scores.history['episode_reward']))callbacks = callbacks[:]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/RL_test_4/hanabi_ml_2.py b/RL_test_4/hanabi_ml_2.py new file mode 100644 index 0000000..b56f0cd --- /dev/null +++ b/RL_test_4/hanabi_ml_2.py @@ -0,0 +1,68 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.14.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +from DumbGame import DumbGameEnv +import numpy as np +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Flatten +from tensorflow.keras.optimizers import Adam + +env = DumbGameEnv() +states = env.observation_space.shape +actions = env.action_space.n +print(f"States:{states} Actions:{actions}") + + +def build_model(states, actions): + model = Sequential() + model.add(Dense(24, activation='relu', input_shape=states)) + model.add(Dense(24, activation='relu')) + model.add(Dense(actions, activation='linear')) + return model + + +# + +from rl.agents import DQNAgent +from rl.policy import BoltzmannQPolicy +from rl.memory import SequentialMemory + +def build_agent(model, actions): + policy = BoltzmannQPolicy() + memory = SequentialMemory(limit=20000, window_length=1) + dqn = DQNAgent(model=model, memory=memory, policy=policy, + nb_actions=actions, nb_steps_warmup=20, target_model_update=0.1) + return dqn + + +# - + +model = build_model(states, actions) +model.summary() + +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=0.1))#, metrics=['mae']) +history = 
dqn.fit(env, nb_steps=5000, visualize=False, verbose=1) + +print(history.params) +print(history.history) + +scores = dqn.test(env, nb_episodes=1, visualize=False) + +# + +#print(np.mean(scores.history['episode_reward'])) +#dqn.get_config() +#scores = dqn.test(env, nb_episodes=1, visualize=False, verbose=1) +#test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1) +#print(np.mean(scores.history['episode_reward']))callbacks = callbacks[:] diff --git a/RL_test_5/DeepReinforcementLearning.ipynb b/RL_test_5/DeepReinforcementLearning.ipynb deleted file mode 100644 index fc934a5..0000000 --- a/RL_test_5/DeepReinforcementLearning.ipynb +++ /dev/null @@ -1,423 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0. Install Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Test Random Environment with OpenAI Gym" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "States:(1,) Actions:5\n" - ] - } - ], - "source": [ - "from DumbGame import DumbGameEnv\n", - "env = DumbGameEnv()\n", - "states = env.observation_space.shape\n", - "actions = env.action_space.n\n", - "print(f\"States:{states} Actions:{actions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Create a Deep Learning Model with Keras" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from tensorflow.keras.models import Sequential\n", - "from tensorflow.keras.layers import Dense, Flatten\n", - "from tensorflow.keras.optimizers import Adam" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "states = env.observation_space.shape\n", - "actions = env.action_space.n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "actions" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(states, actions):\n", - " model = Sequential() \n", - " model.add(Dense(24, activation='relu', input_shape=states))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(actions, activation='linear'))\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "del model " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "model = build_model(states, actions)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential_2\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "dense_6 (Dense) (None, 24) 48 \n", - "_________________________________________________________________\n", - "dense_7 (Dense) (None, 24) 600 \n", - "_________________________________________________________________\n", - "dense_8 
(Dense) (None, 5) 125 \n", - "=================================================================\n", - "Total params: 773\n", - "Trainable params: 773\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Build Agent with Keras-RL" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from rl.agents import DQNAgent\n", - "from rl.policy import BoltzmannQPolicy\n", - "from rl.memory import SequentialMemory" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def build_agent(model, actions):\n", - " policy = BoltzmannQPolicy()\n", - " memory = SequentialMemory(limit=50000, window_length=1)\n", - " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", - " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", - " return dqn" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training for 50000 steps ...\n", - "Interval 1 (0 steps performed)\n", - "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - " 1/10000 [..............................] - ETA: 9:10 - reward: -1.0000" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", - " warnings.warn('Not enough entries to sample without replacement. 
Consider increasing your warm-up phase to avoid oversampling!')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10000/10000 [==============================] - 73s 7ms/step - reward: -0.5974\n", - "402 episodes - episode_reward: -14.821 [-47.000, 5.000] - loss: 1.242 - mae: 5.141 - mean_q: -6.008\n", - "\n", - "Interval 2 (10000 steps performed)\n", - "10000/10000 [==============================] - 70s 7ms/step - reward: -0.5944\n", - "406 episodes - episode_reward: -14.675 [-52.000, 3.000] - loss: 1.479 - mae: 6.070 - mean_q: -7.153\n", - "\n", - "Interval 3 (20000 steps performed)\n", - "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5978\n", - "402 episodes - episode_reward: -14.876 [-59.000, 2.000] - loss: 1.487 - mae: 6.079 - mean_q: -7.167\n", - "\n", - "Interval 4 (30000 steps performed)\n", - "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5982\n", - "402 episodes - episode_reward: -14.883 [-60.000, 4.000] - loss: 1.505 - mae: 6.153 - mean_q: -7.265\n", - "\n", - "Interval 5 (40000 steps performed)\n", - "10000/10000 [==============================] - 73s 7ms/step - reward: -0.6216\n", - "done, took 357.885 seconds\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", - "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 100 episodes ...\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/wp/_fng4ppn01b2j4_j98240s780000gn/T/ipykernel_10921/978772492.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdqn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_episodes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvisualize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'episode_reward'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/core.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, env, nb_episodes, action_repetition, callbacks, visualize, nb_max_episode_steps, nb_max_start_steps, start_step_policy, verbose)\u001b[0m\n\u001b[1;32m 339\u001b[0m 
\u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepisode_step\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 341\u001b[0;31m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 342\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocessor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, observation)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;31m# Select an action.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0mstate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmemory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_recent_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 224\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 225\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpolicy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect_action\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mq_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_q_values\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
69\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/agents/dqn.py\u001b[0m in \u001b[0;36mcompute_batch_q_values\u001b[0;34m(self, state_batch)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_batch_q_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_state_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0mq_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_on_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnb_actions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mq_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py\u001b[0m in \u001b[0;36mpredict_on_batch\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1213\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_predict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1214\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 3823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3824\u001b[0m fetched = self._callable_fn(*array_vals,\n\u001b[0;32m-> 3825\u001b[0;31m 
run_metadata=self.run_metadata)\n\u001b[0m\u001b[1;32m 3826\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_fetch_callbacks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3827\u001b[0m output_structure = nest.pack_sequence_as(\n", - "\u001b[0;32m~/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1470\u001b[0m ret = tf_session.TF_SessionRunCallable(self._session._session,\n\u001b[1;32m 1471\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1472\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 1473\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1474\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", - "print(np.mean(scores.history['episode_reward']))" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 15 episodes ...\n", - "Episode 1: reward: 200.000, steps: 200\n", - "Episode 2: reward: 200.000, steps: 200\n", - "Episode 3: reward: 200.000, steps: 200\n", - "Episode 4: reward: 200.000, steps: 200\n", - "Episode 5: reward: 200.000, steps: 200\n", - "Episode 6: reward: 200.000, steps: 200\n", - "Episode 7: reward: 200.000, steps: 200\n", - "Episode 8: reward: 200.000, steps: 200\n", - "Episode 9: reward: 200.000, steps: 200\n", - "Episode 10: reward: 200.000, steps: 200\n", - "Episode 11: reward: 200.000, steps: 200\n", - "Episode 12: reward: 200.000, steps: 200\n", - "Episode 13: reward: 200.000, steps: 200\n", - "Episode 14: reward: 200.000, steps: 200\n", - "Episode 15: reward: 200.000, steps: 200\n" - ] - } - ], - "source": [ - "_ = dqn.test(env, nb_episodes=15, visualize=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. 
Reloading Agent from Memory" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.save_weights('dqn_weights.h5f', overwrite=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "del model\n", - "del dqn\n", - "del env" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make('CartPole-v0')\n", - "actions = env.action_space.n\n", - "states = env.observation_space.shape[0]\n", - "model = build_model(states, actions)\n", - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "dqn.load_weights('dqn_weights.h5f')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 5 episodes ...\n", - "WARNING:tensorflow:From /Users/nicholasrenotte/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - "Episode 1: reward: 200.000, steps: 200\n", - "Episode 2: reward: 200.000, steps: 200\n", - "Episode 3: reward: 200.000, steps: 200\n", - "Episode 4: reward: 200.000, steps: 200\n", - "Episode 5: reward: 200.000, steps: 200\n" - ] - } - ], - "source": [ - "_ = dqn.test(env, nb_episodes=5, visualize=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/RL_test_5/DeepReinforcementLearning.py b/RL_test_5/DeepReinforcementLearning.py new file mode 100644 index 0000000..e362fd9 --- /dev/null +++ b/RL_test_5/DeepReinforcementLearning.py @@ -0,0 +1,95 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.14.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# # 0. Install Dependencies + +# # 1. Test Random Environment with OpenAI Gym + +from DumbGame import DumbGameEnv +env = DumbGameEnv() +states = env.observation_space.shape +actions = env.action_space.n +print(f"States:{states} Actions:{actions}") + +# # 2. 
Create a Deep Learning Model with Keras + +import numpy as np +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Flatten +from tensorflow.keras.optimizers import Adam + +states = env.observation_space.shape +actions = env.action_space.n + +actions + + +def build_model(states, actions): + model = Sequential() + model.add(Dense(24, activation='relu', input_shape=states)) + model.add(Dense(24, activation='relu')) + model.add(Dense(actions, activation='linear')) + return model + + +del model + +model = build_model(states, actions) + +model.summary() + +# # 3. Build Agent with Keras-RL + +from rl.agents import DQNAgent +from rl.policy import BoltzmannQPolicy +from rl.memory import SequentialMemory + + +def build_agent(model, actions): + policy = BoltzmannQPolicy() + memory = SequentialMemory(limit=50000, window_length=1) + dqn = DQNAgent(model=model, memory=memory, policy=policy, + nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2) + return dqn + + +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=1e-3), metrics=['mae']) +dqn.fit(env, nb_steps=50000, visualize=False, verbose=1) + +scores = dqn.test(env, nb_episodes=100, visualize=False) +print(np.mean(scores.history['episode_reward'])) + +_ = dqn.test(env, nb_episodes=15, visualize=True) + +# # 4. Reloading Agent from Memory + +dqn.save_weights('dqn_weights.h5f', overwrite=True) + +del model +del dqn +del env + +env = gym.make('CartPole-v0') +actions = env.action_space.n +states = env.observation_space.shape[0] +model = build_model(states, actions) +dqn = build_agent(model, actions) +dqn.compile(Adam(lr=1e-3), metrics=['mae']) + +dqn.load_weights('dqn_weights.h5f') + +_ = dqn.test(env, nb_episodes=5, visualize=True) + + From 53fdda8893707ff82369dcb0b1f1d3527f600a38 Mon Sep 17 00:00:00 2001 From: Ben Messerly Date: Fri, 26 Aug 2022 16:04:23 -0500 Subject: [PATCH 6/7] Cleanup. --- ...stonEnvironmentReinforcementLearning.ipynb | 513 ------------------ RL_test_1/checkpoint | 2 - RL_test_2/checkpoint | 2 - 3 files changed, 517 deletions(-) delete mode 100644 RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb delete mode 100644 RL_test_1/checkpoint delete mode 100644 RL_test_2/checkpoint diff --git a/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb b/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb deleted file mode 100644 index 4cd38d1..0000000 --- a/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.ipynb +++ /dev/null @@ -1,513 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 0. Install Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. 
Test Random Environment with OpenAI Gym" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from gym import Env\n", - "from gym.spaces import Discrete, Box\n", - "import numpy as np\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "class ShowerEnv(Env):\n", - " def __init__(self):\n", - " # Actions we can take, down, stay, up\n", - " self.action_space = Discrete(3)\n", - " # Temperature array\n", - " self.observation_space = Box(low=np.array([0]), high=np.array([100]))\n", - " # Set start temp\n", - " self.state = 38 + random.randint(-3,3)\n", - " # Set shower length\n", - " self.shower_length = 60\n", - " \n", - " def step(self, action):\n", - " # Apply action\n", - " # 0 -1 = -1 temperature\n", - " # 1 -1 = 0 \n", - " # 2 -1 = 1 temperature \n", - " self.state += action -1 \n", - " # Reduce shower length by 1 second\n", - " self.shower_length -= 1 \n", - " \n", - " # Calculate reward\n", - " if self.state >=37 and self.state <=39: \n", - " reward =1 \n", - " else: \n", - " reward = -1 \n", - " \n", - " # Check if shower is done\n", - " if self.shower_length <= 0: \n", - " done = True\n", - " else:\n", - " done = False\n", - " \n", - " # Apply temperature noise\n", - " #self.state += random.randint(-1,1)\n", - " # Set placeholder for info\n", - " info = {}\n", - " \n", - " # Return step information\n", - " return self.state, reward, done, info\n", - "\n", - " def render(self):\n", - " # Implement viz\n", - " pass\n", - " \n", - " def reset(self):\n", - " # Reset shower temperature\n", - " self.state = 38 + random.randint(-3,3)\n", - " # Reset shower time\n", - " self.shower_length = 60 \n", - " return self.state\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/gym/spaces/box.py:74: UserWarning: \u001b[33mWARN: Box bound precision lowered by casting to float32\u001b[0m\n", - " \"Box bound precision lowered by casting to {}\".format(self.dtype)\n" - ] - } - ], - "source": [ - "env = ShowerEnv()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([94.43672], dtype=float32)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "env.observation_space.sample()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Episode:1 Score:-60\n", - "Episode:2 Score:12\n", - "Episode:3 Score:-46\n", - "Episode:4 Score:-38\n", - "Episode:5 Score:34\n", - "Episode:6 Score:-22\n", - "Episode:7 Score:-44\n", - "Episode:8 Score:-34\n", - "Episode:9 Score:-56\n", - "Episode:10 Score:-54\n" - ] - } - ], - "source": [ - "episodes = 10\n", - "for episode in range(1, episodes+1):\n", - " state = env.reset()\n", - " done = False\n", - " score = 0 \n", - " \n", - " while not done:\n", - " #env.render()\n", - " action = env.action_space.sample()\n", - " n_state, reward, done, info = env.step(action)\n", - " score+=reward\n", - " print('Episode:{} Score:{}'.format(episode, score))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. 
Create a Deep Learning Model with Keras" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from tensorflow.keras.models import Sequential\n", - "from tensorflow.keras.layers import Dense, Flatten\n", - "from tensorflow.keras.optimizers import Adam" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "states = env.observation_space.shape\n", - "actions = env.action_space.n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "actions" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(states, actions):\n", - " model = Sequential() \n", - " model.add(Dense(24, activation='relu', input_shape=states))\n", - " model.add(Dense(24, activation='relu'))\n", - " model.add(Dense(actions, activation='linear'))\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "del model " - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "model = build_model(states, actions)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "dense (Dense) (None, 24) 48 \n", - "_________________________________________________________________\n", - "dense_1 (Dense) (None, 24) 600 \n", - "_________________________________________________________________\n", - "dense_2 (Dense) (None, 3) 75 \n", - "=================================================================\n", - "Total params: 723\n", - "Trainable params: 723\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. 
Build Agent with Keras-RL" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from rl.agents import DQNAgent\n", - "from rl.policy import BoltzmannQPolicy\n", - "from rl.memory import SequentialMemory" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "def build_agent(model, actions):\n", - " policy = BoltzmannQPolicy()\n", - " memory = SequentialMemory(limit=50000, window_length=1)\n", - " dqn = DQNAgent(model=model, memory=memory, policy=policy, \n", - " nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)\n", - " return dqn" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training for 50000 steps ...\n", - "Interval 1 (0 steps performed)\n", - "WARNING:tensorflow:From /Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", - " 1/10000 [..............................] - ETA: 11:07 - reward: 1.0000" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Ben/Home/Working/Hanabi/envs/lib/python3.7/site-packages/rl/memory.py:37: UserWarning: Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!\n", - " warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10000/10000 [==============================] - 71s 7ms/step - reward: -0.5238\n", - "166 episodes - episode_reward: -31.410 [-60.000, 32.000] - loss: 0.820 - mae: 6.553 - mean_q: -6.437\n", - "\n", - "Interval 2 (10000 steps performed)\n", - "10000/10000 [==============================] - 72s 7ms/step - reward: -0.4258\n", - "167 episodes - episode_reward: -25.725 [-60.000, 44.000] - loss: 1.651 - mae: 9.015 - mean_q: -12.825\n", - "\n", - "Interval 3 (20000 steps performed)\n", - "10000/10000 [==============================] - 73s 7ms/step - reward: -0.4308\n", - "167 episodes - episode_reward: -25.713 [-60.000, 50.000] - loss: 1.489 - mae: 8.015 - mean_q: -11.281\n", - "\n", - "Interval 4 (30000 steps performed)\n", - "10000/10000 [==============================] - 77s 8ms/step - reward: 0.1662\n", - "166 episodes - episode_reward: 9.843 [-60.000, 56.000] - loss: 0.848 - mae: 4.911 - mean_q: -5.555\n", - "\n", - "Interval 5 (40000 steps performed)\n", - "10000/10000 [==============================] - 92s 9ms/step - reward: 0.7746\n", - "done, took 384.441 seconds\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dqn = build_agent(model, actions)\n", - "dqn.compile(Adam(lr=1e-3), metrics=['mae'])\n", - "dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing for 100 episodes ...\n", - "Episode 1: reward: 60.000, steps: 60\n", - "Episode 2: reward: 60.000, steps: 60\n", - 
"Episode 3: reward: 58.000, steps: 60\n", - "Episode 4: reward: 60.000, steps: 60\n", - "Episode 5: reward: 60.000, steps: 60\n", - "Episode 6: reward: 60.000, steps: 60\n", - "Episode 7: reward: 60.000, steps: 60\n", - "Episode 8: reward: 60.000, steps: 60\n", - "Episode 9: reward: 60.000, steps: 60\n", - "Episode 10: reward: 60.000, steps: 60\n", - "Episode 11: reward: 58.000, steps: 60\n", - "Episode 12: reward: 60.000, steps: 60\n", - "Episode 13: reward: 58.000, steps: 60\n", - "Episode 14: reward: 58.000, steps: 60\n", - "Episode 15: reward: 60.000, steps: 60\n", - "Episode 16: reward: 60.000, steps: 60\n", - "Episode 17: reward: 60.000, steps: 60\n", - "Episode 18: reward: 60.000, steps: 60\n", - "Episode 19: reward: 58.000, steps: 60\n", - "Episode 20: reward: 60.000, steps: 60\n", - "Episode 21: reward: 60.000, steps: 60\n", - "Episode 22: reward: 60.000, steps: 60\n", - "Episode 23: reward: 60.000, steps: 60\n", - "Episode 24: reward: 60.000, steps: 60\n", - "Episode 25: reward: 60.000, steps: 60\n", - "Episode 26: reward: 60.000, steps: 60\n", - "Episode 27: reward: 60.000, steps: 60\n", - "Episode 28: reward: 60.000, steps: 60\n", - "Episode 29: reward: 60.000, steps: 60\n", - "Episode 30: reward: 60.000, steps: 60\n", - "Episode 31: reward: 58.000, steps: 60\n", - "Episode 32: reward: 60.000, steps: 60\n", - "Episode 33: reward: 58.000, steps: 60\n", - "Episode 34: reward: 58.000, steps: 60\n", - "Episode 35: reward: 60.000, steps: 60\n", - "Episode 36: reward: 58.000, steps: 60\n", - "Episode 37: reward: 60.000, steps: 60\n", - "Episode 38: reward: 58.000, steps: 60\n", - "Episode 39: reward: 60.000, steps: 60\n", - "Episode 40: reward: 58.000, steps: 60\n", - "Episode 41: reward: 60.000, steps: 60\n", - "Episode 42: reward: 58.000, steps: 60\n", - "Episode 43: reward: 60.000, steps: 60\n", - "Episode 44: reward: 58.000, steps: 60\n", - "Episode 45: reward: 58.000, steps: 60\n", - "Episode 46: reward: 60.000, steps: 60\n", - "Episode 47: reward: 60.000, steps: 60\n", - "Episode 48: reward: 58.000, steps: 60\n", - "Episode 49: reward: 60.000, steps: 60\n", - "Episode 50: reward: 60.000, steps: 60\n", - "Episode 51: reward: 60.000, steps: 60\n", - "Episode 52: reward: 58.000, steps: 60\n", - "Episode 53: reward: 60.000, steps: 60\n", - "Episode 54: reward: 60.000, steps: 60\n", - "Episode 55: reward: 60.000, steps: 60\n", - "Episode 56: reward: 60.000, steps: 60\n", - "Episode 57: reward: 60.000, steps: 60\n", - "Episode 58: reward: 60.000, steps: 60\n", - "Episode 59: reward: 58.000, steps: 60\n", - "Episode 60: reward: 60.000, steps: 60\n", - "Episode 61: reward: 58.000, steps: 60\n", - "Episode 62: reward: 60.000, steps: 60\n", - "Episode 63: reward: 60.000, steps: 60\n", - "Episode 64: reward: 60.000, steps: 60\n", - "Episode 65: reward: 60.000, steps: 60\n", - "Episode 66: reward: 58.000, steps: 60\n", - "Episode 67: reward: 60.000, steps: 60\n", - "Episode 68: reward: 58.000, steps: 60\n", - "Episode 69: reward: 58.000, steps: 60\n", - "Episode 70: reward: 60.000, steps: 60\n", - "Episode 71: reward: 60.000, steps: 60\n", - "Episode 72: reward: 60.000, steps: 60\n", - "Episode 73: reward: 60.000, steps: 60\n", - "Episode 74: reward: 60.000, steps: 60\n", - "Episode 75: reward: 58.000, steps: 60\n", - "Episode 76: reward: 60.000, steps: 60\n", - "Episode 77: reward: 60.000, steps: 60\n", - "Episode 78: reward: 58.000, steps: 60\n", - "Episode 79: reward: 60.000, steps: 60\n", - "Episode 80: reward: 58.000, steps: 60\n", - "Episode 81: reward: 60.000, steps: 60\n", - 
"Episode 82: reward: 60.000, steps: 60\n", - "Episode 83: reward: 60.000, steps: 60\n", - "Episode 84: reward: 58.000, steps: 60\n", - "Episode 85: reward: 60.000, steps: 60\n", - "Episode 86: reward: 60.000, steps: 60\n", - "Episode 87: reward: 60.000, steps: 60\n", - "Episode 88: reward: 60.000, steps: 60\n", - "Episode 89: reward: 60.000, steps: 60\n", - "Episode 90: reward: 58.000, steps: 60\n", - "Episode 91: reward: 60.000, steps: 60\n", - "Episode 92: reward: 58.000, steps: 60\n", - "Episode 93: reward: 60.000, steps: 60\n", - "Episode 94: reward: 58.000, steps: 60\n", - "Episode 95: reward: 60.000, steps: 60\n", - "Episode 96: reward: 60.000, steps: 60\n", - "Episode 97: reward: 58.000, steps: 60\n", - "Episode 98: reward: 58.000, steps: 60\n", - "Episode 99: reward: 60.000, steps: 60\n", - "Episode 100: reward: 60.000, steps: 60\n", - "59.4\n" - ] - } - ], - "source": [ - "scores = dqn.test(env, nb_episodes=100, visualize=False)\n", - "print(np.mean(scores.history['episode_reward']))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/RL_test_1/checkpoint b/RL_test_1/checkpoint deleted file mode 100644 index 0914d39..0000000 --- a/RL_test_1/checkpoint +++ /dev/null @@ -1,2 +0,0 @@ -model_checkpoint_path: "dqn_weights_discrete.h5f" -all_model_checkpoint_paths: "dqn_weights_discrete.h5f" diff --git a/RL_test_2/checkpoint b/RL_test_2/checkpoint deleted file mode 100644 index 4f75474..0000000 --- a/RL_test_2/checkpoint +++ /dev/null @@ -1,2 +0,0 @@ -model_checkpoint_path: "dqn_weights.h5f" -all_model_checkpoint_paths: "dqn_weights.h5f" From b869ce691236b2c7c3d6c0da250b56e734541da6 Mon Sep 17 00:00:00 2001 From: Ben Messerly Date: Fri, 26 Aug 2022 16:08:43 -0500 Subject: [PATCH 7/7] Modifications to py files after syncing. 
--- RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py | 2 +- RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py | 2 +- RL_test_3/Untitled.py | 2 +- RL_test_4/hanabi_ml_2.py | 2 +- RL_test_5/DeepReinforcementLearning.py | 1 + 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py b/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py index 8486a3a..2b7f653 100644 --- a/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py +++ b/RL_test_0/OpenAICustonEnvironmentReinforcementLearning.py @@ -1,7 +1,7 @@ # --- # jupyter: # jupytext: -# formats: ipynb,py +# formats: ipynb,py:light # text_representation: # extension: .py # format_name: light diff --git a/RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py b/RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py index d01eb0b..d87e19e 100644 --- a/RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py +++ b/RL_test_1/OpenAICustomEnvironmentReinforcementLearning.py @@ -1,7 +1,7 @@ # --- # jupyter: # jupytext: -# formats: ipynb,py +# formats: ipynb,py:light # text_representation: # extension: .py # format_name: light diff --git a/RL_test_3/Untitled.py b/RL_test_3/Untitled.py index 9cc76c1..a861436 100644 --- a/RL_test_3/Untitled.py +++ b/RL_test_3/Untitled.py @@ -1,7 +1,7 @@ # --- # jupyter: # jupytext: -# formats: ipynb,py +# formats: ipynb,py:light # text_representation: # extension: .py # format_name: light diff --git a/RL_test_4/hanabi_ml_2.py b/RL_test_4/hanabi_ml_2.py index b56f0cd..9003bc8 100644 --- a/RL_test_4/hanabi_ml_2.py +++ b/RL_test_4/hanabi_ml_2.py @@ -1,7 +1,7 @@ # --- # jupyter: # jupytext: -# formats: ipynb,py +# formats: ipynb,py:light # text_representation: # extension: .py # format_name: light diff --git a/RL_test_5/DeepReinforcementLearning.py b/RL_test_5/DeepReinforcementLearning.py index e362fd9..e7c06b8 100644 --- a/RL_test_5/DeepReinforcementLearning.py +++ b/RL_test_5/DeepReinforcementLearning.py @@ -1,6 +1,7 @@ # --- # jupyter: # jupytext: +# formats: ipynb,py:light # text_representation: # extension: .py # format_name: light
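
For reference, a minimal sketch of the jupytext pairing that PATCH 7/7 sets up (assuming the jupytext package is installed in the working environment; the exact paths below are illustrative, not taken from the patch). With formats: ipynb,py:light declared in the header, the .ipynb and the .py script are treated as two views of the same notebook, so editing either file and syncing propagates the change to its pair:

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:light
#     text_representation:
#       extension: .py
#       format_name: light
# ---

# Pair an existing notebook and keep the pair in sync from the command line:
#   jupytext --set-formats ipynb,py:light DeepReinforcementLearning.ipynb
#   jupytext --sync DeepReinforcementLearning.ipynb

Keeping the light-format .py alongside the notebook is convenient for review: the script diffs cleanly in patches like these, while the .ipynb continues to carry cell outputs.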