diff --git a/NeuronalNetwork/environment/environment_fitness.py b/NeuronalNetwork/environment/environment_fitness.py
index 66f64bb..fa7e269 100644
--- a/NeuronalNetwork/environment/environment_fitness.py
+++ b/NeuronalNetwork/environment/environment_fitness.py
@@ -2,8 +2,7 @@
 import math
 from .environment_node_data import NodeData, Mode
-
-
+from math import sqrt


 class FitnessData:
     """
     Class for calculating the fitness function reward from the current simulation state. Possible reward calculation
@@ -148,35 +147,55 @@ def calculate_reward(self, robot_x: float, robot_y: float, robot_orientation: fl
         :return: Reward, done.
         """
         done = False
-        reward = -1
+        reward = -1.0

-        #distance_start_to_end = self.__distance_start_to_end()
+        distance_start_to_end = self._distance_start_to_end()
+
         distance_robot_to_end = self._distance_robot_to_end(robot_x, robot_y)
         distance_robot_to_start = self._distance_robot_to_start(robot_x, robot_y)
+
         distance_between_last_step = self._distance_between_last_step(robot_x, robot_y)
+
         distance_robot_to_end_last = self._distance_robot_to_end(self._robot_x_last, self._robot_y_last)
         distance_robot_to_end_diff = distance_robot_to_end_last - distance_robot_to_end
+
+        distance_robot_to_end_diff_abs = abs(distance_robot_to_end_diff) * 5
+
+        diff_rotation_to_end_last = math.fabs(self.angle_difference_from_robot_to_end(self._robot_x_last, self._robot_y_last, self._robot_orientation_last))
+        diff_rotation_to_end = math.fabs(self.angle_difference_from_robot_to_end(robot_x, robot_y, robot_orientation))
+
+        diff_rotations = math.fabs(diff_rotation_to_end - diff_rotation_to_end_last) * 5
+
+        # Penalise turning away from the goal, reward turning towards it.
+        if diff_rotation_to_end > diff_rotation_to_end_last:
+            diff_rotations *= -2.0
+        else:
+            diff_rotations *= 1.3
+
+        # Penalise steps that move away from the goal: if the new distance
+        # exceeds the hypotenuse of (step length, old distance), the robot
+        # cannot have moved closer.
+        if distance_robot_to_end > sqrt(distance_between_last_step**2 + distance_robot_to_end_last**2):
+            distance_robot_to_end_diff_abs *= -2.0
-        if distance_robot_to_end_diff < 0:
-            distance_robot_to_end_diff *= 2
         else:
-            distance_robot_to_end_diff *= 1.5
-        reward += distance_robot_to_end_diff
+            distance_robot_to_end_diff_abs *= 1.3
+
+        reward += distance_robot_to_end_diff_abs
+        reward += diff_rotations

         #reward = distance_between_last_step + (1 - distance_robot_to_end / distance_start_to_end) + distance_between_last_step
         # reward += 10 * max((10 - self._distance_robot_to_end(robot_x, robot_y)) / 10, 0)
-        # reward += distance_robot_to_start
-        # reward += distance_between_last_step
-        # reward -= distance_robot_to_end
+        #reward += distance_robot_to_start * 0.3
+        #reward -= distance_start_to_end * 0.1
+        #reward += distance_between_last_step
+        #reward -= distance_robot_to_end * 0.1

-        reward += ((math.pi - math.fabs(self._difference_two_angles(robot_orientation, self._orientation_robot_to_end(robot_x, robot_y)))) / math.pi) * 1
         #reward += max((5 - self._distance_robot_to_end(robot_x, robot_y)) / 5, 0)
         #reward = 0

         if env_done:
-            reward = -10 #- distance_robot_to_end / distance_start_to_end * 100
+            reward = -20 #- distance_robot_to_end / distance_start_to_end * 100
             done = True
         elif distance_robot_to_end < self._node_data.get_node_end().radius():
             reward = 10
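The shaping above combines a progress term and a heading term. A minimal standalone sketch of the same arithmetic (function and argument names are illustrative, not part of the repo):

import math

def shaped_reward(d_end, d_end_last, step_dist, rot_err, rot_err_last):
    # Per-step penalty, as in the patch.
    reward = -1.0
    # Heading term: scaled change in angular error to the goal,
    # negative when the error grew.
    rot_term = math.fabs(rot_err - rot_err_last) * 5
    rot_term *= -2.0 if rot_err > rot_err_last else 1.3
    # Distance term: scaled change in distance to the goal. If the new
    # distance exceeds the hypotenuse of (step length, old distance),
    # the step moved away and the term flips negative.
    dist_term = abs(d_end_last - d_end) * 5
    if d_end > math.sqrt(step_dist ** 2 + d_end_last ** 2):
        dist_term *= -2.0
    else:
        dist_term *= 1.3
    return reward + dist_term + rot_term

# e.g. closing 0.1 m (1.0 -> 0.9) while reducing the angular error
# from 0.10 to 0.05 rad gives -1.0 + 0.65 + 0.325 = -0.025
print(shaped_reward(0.9, 1.0, 0.1, 0.05, 0.10))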
diff --git a/NeuronalNetwork/ga3c/Config.py b/NeuronalNetwork/ga3c/Config.py
index ba9e248..d2e3e4b 100644
--- a/NeuronalNetwork/ga3c/Config.py
+++ b/NeuronalNetwork/ga3c/Config.py
@@ -33,21 +33,34 @@ class Config:
     # Environment configuration
     # Path of the world
-    PATH_TO_WORLD = ["../Simulation2d/world/roblab"]
+    PATH_TO_WORLD = ["../Simulation2d/world/square"]
     # Use this for multiple Environments in parallel
-    # PATH_TO_WORLD = ["../Simulation2d/world/room", "../Simulation2d/world/four_rooms"]
+    #train_1
+    #PATH_TO_WORLD = ["../Simulation2d/world/square","../Simulation2d/world/square", "../Simulation2d/world/square"]
+    #train_2 (shadowed by the train_4 assignment below)
+    #PATH_TO_WORLD = ["../Simulation2d/world/room","../Simulation2d/world/room"]
+    #PATH_TO_WORLD = ["../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms"]
+    #train_3
+    #PATH_TO_WORLD = ["../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms","../Simulation2d/world/four_rooms"]
+
+    #train_4
+    NETWORK_DIR = ""
+    PATH_TO_WORLD = ["../Simulation2d/world/roblab"]
+    #train_5
+    #PATH_TO_WORLD = ["../Simulation2d/world/room"]
     # Mode
     MODE=Mode.ALL_RANDOM
     # Terminate the simulation
     TERMINATE_AT_END=False
     # Cluster size of the lidar
-    CLUSTER_SIZE=1
+    CLUSTER_SIZE=18
     # use observation rotation vector
     USE_OBSERVATION_ROTATION=True
     # Observation rotation vector size
-    OBSERVATION_ROTATION_SIZE=128
-
+    OBSERVATION_ROTATION_SIZE=16
     # Visualize for training
     VISUALIZE = True
@@ -56,9 +69,9 @@ class Config:
     # Enable to train
     TRAIN_MODELS = True
     # Load old models. Throws if the model doesn't exist
-    LOAD_CHECKPOINT = False
+    LOAD_CHECKPOINT = True
     # If 0, the latest checkpoint is loaded
-    LOAD_EPISODE = 0
+    LOAD_EPISODE = 0

    #########################################################################
    # Number of agents, predictors, trainers and other system settings
@@ -67,7 +80,7 @@ class Config:
     # Number of Agents
     AGENTS = 32#32
     # Number of Predictors
-    PREDICTORS = 2 #2
+    PREDICTORS = 2#2
     # Number of Trainers
     TRAINERS = 2 #2
@@ -83,7 +96,7 @@ class Config:
     # Algorithm parameters
     # Max step iteration -> if reached, the environment is done. 0 for endless.
-    MAX_STEP_ITERATION = 300
+    MAX_STEP_ITERATION = 1000

     # Discount factor
     DISCOUNT = 0.99
@@ -96,16 +109,16 @@ class Config:
     REWARD_MAX = 1

     # Max size of the queue
-    MAX_QUEUE_SIZE = 100 #100
-    PREDICTION_BATCH_SIZE = 128 #128
+    MAX_QUEUE_SIZE = 200 #100
+    PREDICTION_BATCH_SIZE = 256 #128

     # Input of the DNN
-    STACKED_FRAMES = 4
-    OBSERVATION_SIZE=1081+OBSERVATION_ROTATION_SIZE
+    STACKED_FRAMES = 1
+    OBSERVATION_SIZE=60

     # Total number of episodes and annealing frequency
-    EPISODES = 400000
-    ANNEALING_EPISODE_COUNT = 400000
+    EPISODES = 100000
+    ANNEALING_EPISODE_COUNT = 100000

     # Entropy regularization hyper-parameter
     BETA_START = 0.01
@@ -129,20 +142,20 @@ class Config:
     # Epsilon (regularize policy lag in GA3C)
     LOG_EPSILON = 1e-6
     # Training min batch size - increasing the batch size increases the stability of the algorithm, but makes learning slower
-    TRAINING_MIN_BATCH_SIZE = 32 #0
+    TRAINING_MIN_BATCH_SIZE = 128 #0

    #########################################################################
    # Log and save

     # Enable TensorBoard
-    TENSORBOARD = False
+    TENSORBOARD = True
     # Update TensorBoard every X training steps
     TENSORBOARD_UPDATE_FREQUENCY = 1000

     # Enable to save models every SAVE_FREQUENCY episodes
     SAVE_MODELS = True
     # Save every SAVE_FREQUENCY episodes
-    SAVE_FREQUENCY = 1000
+    SAVE_FREQUENCY = 200

     # Print stats every PRINT_STATS_FREQUENCY episodes
     PRINT_STATS_FREQUENCY = 1
@@ -152,7 +165,7 @@ class Config:
     # Results filename
     RESULTS_FILENAME = 'results.txt'
     # Network checkpoint name
-    NETWORK_NAME = 'network'
+    NETWORK_NAME = 'network60x60'

    #########################################################################
    # More experimental parameters here
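The new lidar settings hang together as follows; a small sanity-check sketch (the 1081-beam scan size comes from the removed OBSERVATION_SIZE formula above, and the integer division is an assumption about how the simulator clusters beams):

LIDAR_BEAMS = 1081            # raw scan size, per the old OBSERVATION_SIZE formula
CLUSTER_SIZE = 18
OBSERVATION_SIZE = 60         # side length of the grid maps built below
OBSERVATION_ROTATION_SIZE = 16
STACKED_FRAMES = 1

# 1081 beams clustered by 18 give 60 values per scan (integer division),
# which GridMap expands into a 60x60 state map and a 60x60 reward map.
assert LIDAR_BEAMS // CLUSTER_SIZE == OBSERVATION_SIZE
# Per step the network therefore sees one 60x60x2 map pair plus a 16x1
# rotation vector instead of the old 1081+128 flat vector.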
diff --git a/NeuronalNetwork/ga3c/Environment.py b/NeuronalNetwork/ga3c/Environment.py
index 1b34f7c..5440bab 100644
--- a/NeuronalNetwork/ga3c/Environment.py
+++ b/NeuronalNetwork/ga3c/Environment.py
@@ -37,6 +37,7 @@
 from .GameManager import GameManager
 from action_mapper import ACTION_SIZE
+from .Grid_map import GridMap


 class Environment:
     def __init__(self, id=-1):
@@ -52,10 +53,13 @@ def __init__(self, id=-1):
     def _get_current_state(self):
         if not self.frame_q.full():
             return None  # frame queue is not full yet.
-        x_ = np.array(self.frame_q.queue)
+        _maps = np.array([i[0] for i in self.frame_q.queue])
+        _rotations = np.array([i[1] for i in self.frame_q.queue])
+        _maps = np.reshape(_maps, (60, 60, 2))
+        _rotations = np.reshape(_rotations, (16, 1))
         #x_ = np.transpose(x_, [1, 2, 0])  # move channels
-        x_ = np.transpose(x_, [1, 0])  # move channels
-        return x_
+        #x_ = np.transpose(x_, [1, 0])  # move channels
+        # ragged pair: maps (60, 60, 2) and rotation (16, 1)
+        return np.array([_maps, _rotations])

     def _update_frame_q(self, frame):
         if self.frame_q.full():
diff --git a/NeuronalNetwork/ga3c/GameManager.py b/NeuronalNetwork/ga3c/GameManager.py
index e15ce2c..2e00b08 100644
--- a/NeuronalNetwork/ga3c/GameManager.py
+++ b/NeuronalNetwork/ga3c/GameManager.py
@@ -26,6 +26,8 @@
 from action_mapper import map_action
 from environment.environment import Environment
+import numpy as np
+from .Grid_map import GridMap

 from .Config import Config
@@ -55,22 +57,39 @@ def __init__(self, id):
     def reset(self):
         observation, _, _, _ = self.env.reset()
-        return observation
+        input_laser, rotation = self.process_observation(observation)
+        grid_map = GridMap(input_laser)
+        obs = np.array([[grid_map.States_map, grid_map.Reward_map], [rotation]])
+        return obs

     def step(self, action):
         self._update_display()
         if action is None:
             observation, reward, done, info = self.env.step(0, 0, 20)
+
+            input_laser, rotation = self.process_observation(observation)
+            grid_map = GridMap(input_laser)
+            obs = np.array([[grid_map.States_map, grid_map.Reward_map], [rotation]])
             reward = 0
             done = False
         else:
+            linear, angular = map_action(action)
             observation, reward, done, info = self.env.step(linear, angular, 20)
-        return observation, reward, done, info
+            input_laser, rotation = self.process_observation(observation)
+            grid_map = GridMap(input_laser)
+            obs = np.array([[grid_map.States_map, grid_map.Reward_map], [rotation]])
+
+        return obs, reward, done, info

     def _update_display(self):
         if self.visualize:
             self.env.visualize()

     def observation_size(self):
-        return self.env.observation_size()
\ No newline at end of file
+        return self.env.observation_size()
+
+    def process_observation(self, observation):
+        # Split the flat observation into the laser scan and the rotation vector.
+        laser_scan = np.array(observation[:Config.OBSERVATION_SIZE])
+        orientation = np.array(observation[Config.OBSERVATION_SIZE:])
+        return laser_scan, orientation
\ No newline at end of file
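GameManager.reset() and step() now emit a ragged (maps, rotation) pair rather than a flat vector. A small sketch of the resulting structure (shapes taken from Config; the explicit dtype=object is needed on newer NumPy, which no longer builds ragged arrays implicitly):

import numpy as np

states_map = np.zeros((60, 60))   # GridMap.States_map
reward_map = np.zeros((60, 60))   # GridMap.Reward_map
rotation = np.zeros(16)           # rotation part of the observation

obs = np.array([[states_map, reward_map], [rotation]], dtype=object)
assert obs.shape == (2,)              # ragged: two rows of different length
assert obs[0][0].shape == (60, 60)    # occupancy/state map
assert obs[1][0].shape == (16,)       # rotation vector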
diff --git a/NeuronalNetwork/ga3c/Grid_map.py b/NeuronalNetwork/ga3c/Grid_map.py
new file mode 100644
index 0000000..e1a27e5
--- /dev/null
+++ b/NeuronalNetwork/ga3c/Grid_map.py
@@ -0,0 +1,72 @@
+import numpy as np
+
+
+class GridMap:
+
+    def __init__(self, input_laser):
+        self.state = self.normalise(input_laser)
+        self.height = self.state.size
+        self.width = self.state.size
+        self.States_map = self.S_map()
+        self.Reward_map = self.R_map()
+
+    def S_map(self):
+        # Build a binary occupancy map: each scan value becomes one column
+        # with `zeros` free cells followed by `ones` occupied cells.
+        state = np.reshape(self.state, (self.width,))
+        States_map = np.zeros((1, self.height))
+        for s_ in state:
+            zeros = int(((s_ * 200) * self.height) / 100)
+            if zeros > self.height:
+                zeros = self.height
+            if self.height - zeros > 0:
+                ones = int(self.height - zeros)
+            else:
+                ones = 0
+            col = np.zeros((zeros,), dtype=float)
+            col = np.append(col, np.ones((ones,), dtype=float))
+            col = np.reshape(col, (1, self.height))
+            States_map = np.append(States_map, col, axis=0)
+        States_map = np.rot90(np.reshape(np.delete(States_map, 0, 0), (self.width, self.height)))
+        return States_map
+
+    def R_map(self):
+        # Derive a reward map from the occupancy map: -1 for occupied cells,
+        # -0.04 for free cells and a goal value (ziel) per column.
+        Reward_map = np.zeros((1, self.width))
+        States_map = np.rot90(self.States_map, 1)
+
+        #States_map = np.transpose(self.States_map)
+        for s in States_map:
+            ones = np.count_nonzero(s) - 1
+            zeros = 0
+            if ones < 0:
+                ones = 0
+                zeros -= 1
+            zeros += np.count_nonzero(s - 1)
+            collisions = np.ones((ones,), dtype=float) * (-1)
+            free = np.ones((zeros,), dtype=float) * (-0.04)
+
+            if zeros == self.height:
+                ziel = zeros * 1
+            elif zeros < self.height and zeros >= ((self.height) / 2):
+                ziel = zeros * 0.5
+            else:
+                ziel = ones * (-1)
+            R_col = np.array(np.append(np.append(collisions, ziel), free))
+            R_col = np.reshape(R_col, (1, self.width))
+            Reward_map = np.append(Reward_map, R_col, axis=0)
+        Reward_map = np.delete(Reward_map, 0, 0)
+        Reward_map = np.rot90(Reward_map, -1)
+        return Reward_map
+
+    def normalise(self, s):
+        # Min-max normalise the scan into [0, 1].
+        s = np.array([s])
+        shape = s.shape
+        max_min = np.amax(s) - np.amin(s)
+        s_min = np.amin(s)
+        if max_min != 0:
+            s = (s - s_min) / max_min
+            #s = (s + 1) / 2
+        else:
+            s = (s - s_min)
+            #s = (s + 1) / 2
+        return np.reshape(s, shape)
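A tiny worked example of how GridMap turns a scan into the two maps: normalise() min-max scales the scan into [0, 1], S_map() converts each scaled range into a column of free (0) and occupied (1) cells, and R_map() assigns -1 to occupied cells, -0.04 to free cells and a per-column goal value (ziel). The 4-beam scan is illustrative; in training the scan has length Config.OBSERVATION_SIZE = 60:

import numpy as np
from Grid_map import GridMap  # import path depends on how the package is run

scan = np.array([0.5, 1.0, 2.0, 4.0])   # metres, arbitrary example values
grid = GridMap(scan)

assert grid.States_map.shape == (4, 4)
assert grid.Reward_map.shape == (4, 4)
# With a 60-value scan both maps come out 60x60, matching input2D below.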
diff --git a/NeuronalNetwork/ga3c/NetworkVP_Hsm.py b/NeuronalNetwork/ga3c/NetworkVP_Hsm.py
new file mode 100644
index 0000000..bf40f1a
--- /dev/null
+++ b/NeuronalNetwork/ga3c/NetworkVP_Hsm.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import re
+
+import numpy as np
+import tensorflow as tf
+
+from .Config import Config
+
+
+class NetworkVP_Hsm:
+    def __init__(self, device, model_name, num_actions):
+        self.device = device
+        self.model_name = model_name
+        self.num_actions = num_actions
+
+        self.observation_size = Config.OBSERVATION_SIZE
+        # @todo rotation_size
+        self.rotation_size = Config.OBSERVATION_ROTATION_SIZE
+        self.observation_channels = Config.STACKED_FRAMES
+
+        self.learning_rate = Config.LEARNING_RATE_START
+        self.beta = Config.BETA_START
+        self.log_epsilon = Config.LOG_EPSILON
+
+        self.graph = tf.Graph()
+        with self.graph.as_default() as g:
+            with tf.device(self.device):
+                self._create_graph()
+
+                self.sess = tf.Session(
+                    graph=self.graph,
+                    config=tf.ConfigProto(
+                        allow_soft_placement=True,
+                        log_device_placement=False,
+                        gpu_options=tf.GPUOptions(allow_growth=True)))
+                self.sess.run(tf.global_variables_initializer())
+
+                if Config.TENSORBOARD: self._create_tensor_board()
+                if Config.LOAD_CHECKPOINT or Config.SAVE_MODELS:
+                    vars = tf.global_variables()
+                    self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0)
+
+    def _create_graph(self):
+        #
+        # self.x = tf.placeholder(
+        #     tf.float32, [None, self.observation_size, self.observation_channels], name='X')
+        # self.x = tf.expand_dims(self.x, -1)
+        # self.y_r = tf.placeholder(tf.float32, [None], name='Yr')
+        #
+        # self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[])
+        # self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[])
+        #
+        # self.global_step = tf.Variable(0, trainable=False, name='step')
+        #
+        # # As implemented in A3C paper
+        # self.n1 = self.conv2d_layer(self.x, 8, 16, 'conv11', strides=[1, 4, 4, 1])
+        # self.n2 = self.conv2d_layer(self.n1, 4, 32, 'conv12', strides=[1, 2, 2, 1])
+        # self.action_index = tf.placeholder(tf.float32, [None, self.num_actions])
+        # _input = self.n2
+        #
+        # flatten_input_shape = _input.get_shape()
+        # nb_elements = flatten_input_shape[1] * flatten_input_shape[2] * flatten_input_shape[3]
+        #
+        # self.flat = tf.reshape(_input, shape=[-1, nb_elements._value])
+        # self.d1 = self.dense_layer(self.flat, 256, 'dense1')
+        #
+        # self.logits_v = tf.squeeze(self.dense_layer(self.d1, 1, 'logits_v', func=None), axis=[1])
+        # self.cost_v = 0.5 * tf.reduce_sum(tf.square(self.y_r - self.logits_v), axis=0)
+
+        # @todo add input2D + change observation_size in config
+        self.input2D = tf.placeholder(tf.float32, [None, self.observation_size, self.observation_size,
+                                                   2 * self.observation_channels], name='input2D')
+
+        self.rotation = tf.placeholder(
+            tf.float32, [None, self.rotation_size, self.observation_channels], name='rotation')
+
+        # self.x = tf.placeholder(
+        #     tf.float32, [None, self.observation_size, self.observation_channels], name='X')
+        self.y_r = tf.placeholder(tf.float32, [None], name='Yr')
+
+        # @todo add change observation size + rotation size
+        # rotation with conv1d
+        # input with conv2d
+        # flatten input and output of conv1d
+
+        self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[])
+        self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[])
+
+        self.global_step = tf.Variable(0, trainable=False, name='step')
+        # @todo add flatten conv2d
+
+        # As implemented in A3C paper
+        self.conv2D_1 = self.conv2d_layer(self.input2D, 5, 16, 'con2d_1', strides=[1, 2, 2, 1])
+        self.conv2D_2 = self.conv2d_layer(self.conv2D_1, 5, 32, 'con2d_2', strides=[1, 4, 4, 1])
+
+        _input2D = self.conv2D_2
+        flatten_input2D_shape = _input2D.get_shape()
+        nb_elements = flatten_input2D_shape[1] * flatten_input2D_shape[2] * flatten_input2D_shape[3]
+
+        self.flat2D = tf.reshape(_input2D, shape=[-1, nb_elements._value])
+
+        self.n1 = self.conv1d_layer(self.rotation, 9, 16, 'conv1_1', stride=5)  # @todo statt x => rotation
+        self.n2 = self.conv1d_layer(self.n1, 5, 32, 'conv1_2', stride=3)
+        self.action_index = tf.placeholder(tf.float32, [None, self.num_actions])
+
+        _input_rotation = self.n2
+
+        # @todo add merge conv2d to the conv1d
+        flatten_input_rotation_shape = _input_rotation.get_shape()
+        nb_elements = flatten_input_rotation_shape[1] * flatten_input_rotation_shape[2]
+
+        self.flat_rotation = tf.reshape(_input_rotation, shape=[-1, nb_elements._value])
+        # @todo concat flat2D flat rotation
+        self.concat = tf.concat([self.flat2D, self.flat_rotation], -1)
+        self.d1 = self.dense_layer(self.concat, 128, 'dense1')
+
+        self.logits_v = tf.squeeze(self.dense_layer(self.d1, 1, 'logits_v', func=None), axis=[1])
+        self.cost_v = 0.5 * tf.reduce_sum(tf.square(self.y_r - self.logits_v), axis=0)
+
+        self.logits_p = self.dense_layer(self.d1, self.num_actions, 'logits_p', func=None)
+
+        if Config.USE_LOG_SOFTMAX:
+            self.softmax_p = tf.nn.softmax(self.logits_p)
+            self.log_softmax_p = tf.nn.log_softmax(self.logits_p)
+            self.log_selected_action_prob = tf.reduce_sum(self.log_softmax_p * self.action_index, axis=1)
+
+            self.cost_p_1 = self.log_selected_action_prob * (self.y_r - tf.stop_gradient(self.logits_v))
+            self.cost_p_2 = -1 * self.var_beta * tf.reduce_sum(self.log_softmax_p * self.softmax_p, axis=1)
+        else:
+            self.softmax_p = (tf.nn.softmax(self.logits_p) + Config.MIN_POLICY) / (
+                1.0 + Config.MIN_POLICY * self.num_actions)
+            self.selected_action_prob = tf.reduce_sum(self.softmax_p * self.action_index, axis=1)
+
+            self.cost_p_1 = tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) \
+                            * (self.y_r - tf.stop_gradient(self.logits_v))
+            self.cost_p_2 = -1 * self.var_beta * \
+                            tf.reduce_sum(tf.log(tf.maximum(self.softmax_p, self.log_epsilon)) *
+                                          self.softmax_p, axis=1)
+
+        self.cost_p_1_agg = tf.reduce_sum(self.cost_p_1, axis=0)
+        self.cost_p_2_agg = tf.reduce_sum(self.cost_p_2, axis=0)
+        self.cost_p = -(self.cost_p_1_agg + self.cost_p_2_agg)
+
+        if Config.DUAL_RMSPROP:
+            self.opt_p = tf.train.RMSPropOptimizer(
+                learning_rate=self.var_learning_rate,
+                decay=Config.RMSPROP_DECAY,
+                momentum=Config.RMSPROP_MOMENTUM,
+                epsilon=Config.RMSPROP_EPSILON)
+
+            self.opt_v = tf.train.RMSPropOptimizer(
+                learning_rate=self.var_learning_rate,
+                decay=Config.RMSPROP_DECAY,
+                momentum=Config.RMSPROP_MOMENTUM,
+                epsilon=Config.RMSPROP_EPSILON)
+        else:
+            self.cost_all = self.cost_p + self.cost_v
+            self.opt = tf.train.RMSPropOptimizer(
+                learning_rate=self.var_learning_rate,
+                decay=Config.RMSPROP_DECAY,
+                momentum=Config.RMSPROP_MOMENTUM,
+                epsilon=Config.RMSPROP_EPSILON)
+
+        if Config.USE_GRAD_CLIP:
+            if Config.DUAL_RMSPROP:
+                self.opt_grad_v = self.opt_v.compute_gradients(self.cost_v)
+                self.opt_grad_v_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
+                                           for g, v in self.opt_grad_v if not g is None]
+                self.train_op_v = self.opt_v.apply_gradients(self.opt_grad_v_clipped)
+
+                self.opt_grad_p = self.opt_p.compute_gradients(self.cost_p)
+                self.opt_grad_p_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
+                                           for g, v in self.opt_grad_p if not g is None]
+                self.train_op_p = self.opt_p.apply_gradients(self.opt_grad_p_clipped)
+                self.train_op = [self.train_op_p, self.train_op_v]
+            else:
+                self.opt_grad = self.opt.compute_gradients(self.cost_all)
+                self.opt_grad_clipped = [(tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM), v) for g, v in
+                                         self.opt_grad]
+                self.train_op = self.opt.apply_gradients(self.opt_grad_clipped)
+        else:
+            if Config.DUAL_RMSPROP:
+                self.train_op_v = self.opt_p.minimize(self.cost_v, global_step=self.global_step)
+                self.train_op_p = self.opt_v.minimize(self.cost_p, global_step=self.global_step)
+                self.train_op = [self.train_op_p, self.train_op_v]
+            else:
+                self.train_op = self.opt.minimize(self.cost_all, global_step=self.global_step)
+
+    def _create_tensor_board(self):
+        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
+        summaries.append(tf.summary.scalar("Pcost_advantage", self.cost_p_1_agg))
+        summaries.append(tf.summary.scalar("Pcost_entropy", self.cost_p_2_agg))
+        summaries.append(tf.summary.scalar("Pcost", self.cost_p))
+        summaries.append(tf.summary.scalar("Vcost", self.cost_v))
+        summaries.append(tf.summary.scalar("LearningRate", self.var_learning_rate))
+        summaries.append(tf.summary.scalar("Beta", self.var_beta))
+        for var in tf.trainable_variables():
+            summaries.append(tf.summary.histogram("weights_%s" % var.name, var))
+
+        summaries.append(tf.summary.histogram("activation_n1", self.n1))
+        summaries.append(tf.summary.histogram("activation_n2", self.n2))
+        summaries.append(tf.summary.histogram("activation_d2", self.d1))
+        summaries.append(tf.summary.histogram("activation_v", self.logits_v))
+        summaries.append(tf.summary.histogram("activation_p", self.softmax_p))
+
+        self.summary_op = tf.summary.merge(summaries)
+        self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name, self.sess.graph)
+
+    def dense_layer(self, input, out_dim, name, func=tf.nn.relu):
+        in_dim = input.get_shape().as_list()[-1]
+        d = 1.0 / np.sqrt(in_dim)
+        with tf.variable_scope(name):
+            w_init = tf.random_uniform_initializer(-d, d)
+            b_init = tf.random_uniform_initializer(-d, d)
+            w = tf.get_variable('w', dtype=tf.float32, shape=[in_dim, out_dim], initializer=w_init)
+            b = tf.get_variable('b', shape=[out_dim], initializer=b_init)
+
+            output = tf.matmul(input, w) + b
+            if func is not None:
+                output = func(output)
+
+        return output
+
+    def conv2d_layer(self, input, filter_size, out_dim, name, strides, func=tf.nn.relu):
+        in_dim = input.get_shape().as_list()[-1]
+        d = 1.0 / np.sqrt(filter_size * in_dim)
+        with tf.variable_scope(name):
+            w_init = tf.random_uniform_initializer(-d, d)
+            b_init = tf.random_uniform_initializer(-d, d)
+            w = tf.get_variable('w',
+                                shape=[filter_size, filter_size, in_dim, out_dim],
+                                dtype=tf.float32,
+                                initializer=w_init)
+            b = tf.get_variable('b', shape=[out_dim], initializer=b_init)
+
+            output = tf.nn.conv2d(input, w, strides=strides, padding='SAME') + b
+            if func is not None:
+                output = func(output)
+
+        return output
+
+    def conv1d_layer(self, input, filter_size, out_dim, name, stride, func=tf.nn.relu):
+        in_dim = input.get_shape().as_list()[-1]
+        d = 1.0 / np.sqrt(filter_size * in_dim)
+        with tf.variable_scope(name):
+            w_init = tf.random_uniform_initializer(-d, d)
+            b_init = tf.random_uniform_initializer(-d, d)
+            w = tf.get_variable('w',
+                                shape=[filter_size, in_dim, out_dim],
+                                dtype=tf.float32,
+                                initializer=w_init)
+            b = tf.get_variable('b', shape=[out_dim], initializer=b_init)
+
+            output = tf.nn.conv1d(input, w, stride=stride, padding='SAME') + b
+            if func is not None:
+                output = func(output)
+
+        return output
+
+    def __get_base_feed_dict(self):
+        return {self.var_beta: self.beta, self.var_learning_rate: self.learning_rate}
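With SAME padding each conv layer's output length is ceil(input / stride), so the flattened widths feeding dense1 can be checked by hand; a quick sketch of that arithmetic (sizes follow Config: 60x60x2 maps, 16x1 rotation):

import math

def same_out(size, stride):
    # SAME padding: output length rounds up.
    return math.ceil(size / stride)

side = same_out(60, 2)            # con2d_1, stride 2 -> 30x30x16
side = same_out(side, 4)          # con2d_2, stride 4 -> 8x8x32
flat2d = side * side * 32         # 2048 = width of flat2D

r = same_out(16, 5)               # conv1_1, stride 5 -> 4x16
r = same_out(r, 3)                # conv1_2, stride 3 -> 2x32
flat_rot = r * 32                 # 64 = width of flat_rotation

assert flat2d + flat_rot == 2112  # width of self.concat fed to dense1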
+    def get_global_step(self):
+        step = self.sess.run(self.global_step)
+        return step
+
+    def predict_single(self, x):
+        return self.predict_p(x)[0]
+
+    def predict_v(self, x):
+        prediction = self.sess.run(self.logits_v, feed_dict={self.input2D: x[0], self.rotation: x[1]})
+        return prediction
+
+    def predict_p(self, x):
+        prediction = self.sess.run(self.softmax_p, feed_dict={self.input2D: x[0], self.rotation: x[1]})
+        return prediction
+
+    def predict_p_and_v(self, x):
+        return self.sess.run([self.softmax_p, self.logits_v], feed_dict={self.input2D: x[0], self.rotation: x[1]})
+
+    def train(self, x, y_r, a, trainer_id):
+        input2D = [i[0] for i in x]
+        rotation = [i[1] for i in x]
+        input2D = np.reshape(input2D, (len(input2D), self.observation_size, self.observation_size, self.observation_channels * 2))
+        rotation = np.reshape(rotation, (len(rotation), self.rotation_size, self.observation_channels))
+        feed_dict = self.__get_base_feed_dict()
+        feed_dict.update({self.input2D: input2D, self.rotation: rotation, self.y_r: y_r, self.action_index: a})
+        self.sess.run(self.train_op, feed_dict=feed_dict)
+
+    def log(self, x, y_r, a):
+        input2D = [i[0] for i in x]
+        rotation = [i[1] for i in x]
+        input2D = np.reshape(input2D, (len(input2D), self.observation_size, self.observation_size, self.observation_channels * 2))
+        rotation = np.reshape(rotation, (len(rotation), self.rotation_size, self.observation_channels))
+        feed_dict = self.__get_base_feed_dict()
+        feed_dict.update({self.input2D: input2D, self.rotation: rotation, self.y_r: y_r, self.action_index: a})
+        step, summary = self.sess.run([self.global_step, self.summary_op], feed_dict=feed_dict)
+        self.log_writer.add_summary(summary, step)
+
+    def _checkpoint_filename(self, episode):
+        return 'checkpoints' + Config.NETWORK_DIR + '/%s_%08d' % (self.model_name, episode)
+
+    def _get_episode_from_filename(self, filename):
+        # TODO: hacky way of getting the episode; ideally the episode should be stored as a TF variable
+        return int(re.split('/|_|\.', filename)[2])
+
+    def save(self, episode):
+        self.saver.save(self.sess, self._checkpoint_filename(episode))
+
+    def load(self):
+        filename = tf.train.latest_checkpoint(os.path.dirname(self._checkpoint_filename(episode=0)))
+        if Config.LOAD_EPISODE > 0:
+            filename = self._checkpoint_filename(Config.LOAD_EPISODE)
+        self.saver.restore(self.sess, filename)
+        return self._get_episode_from_filename(filename)
+
+    def get_variables_names(self):
+        return [var.name for var in self.graph.get_collection('trainable_variables')]
+
+    def get_variable_value(self, name):
+        return self.sess.run(self.graph.get_tensor_by_name(name))
diff --git a/NeuronalNetwork/ga3c/Server.py b/NeuronalNetwork/ga3c/Server.py
index 2cb7945..6b22e01 100644
--- a/NeuronalNetwork/ga3c/Server.py
+++ b/NeuronalNetwork/ga3c/Server.py
@@ -30,7 +30,7 @@
 from .Config import Config
 from .Environment import Environment
-from .NetworkVP import NetworkVP
+from .NetworkVP_Hsm import NetworkVP_Hsm
 from .ProcessAgent import ProcessAgent
 from .ProcessStats import ProcessStats
 from .ThreadDynamicAdjustment import ThreadDynamicAdjustment
@@ -45,7 +45,7 @@ def __init__(self):
         self.training_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
         self.prediction_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)

-        self.model = NetworkVP(Config.DEVICE, Config.NETWORK_NAME, Environment().get_num_actions())
+        self.model = NetworkVP_Hsm(Config.DEVICE, Config.NETWORK_NAME, Environment().get_num_actions())
         if Config.LOAD_CHECKPOINT:
             self.stats.episode_count.value = self.model.load()
@@ -113,6 +113,7 @@ def main(self):
         while self.stats.episode_count.value < Config.EPISODES:
             step = min(self.stats.episode_count.value, Config.ANNEALING_EPISODE_COUNT - 1)
+            self.model.learning_rate = Config.LEARNING_RATE_START + learning_rate_multiplier * step
             self.model.beta = Config.BETA_START + beta_multiplier * step

@@ -121,7 +122,7 @@ def main(self):
                 self.save_model()
                 self.stats.should_save_model.value = 0

-            time.sleep(0.01)
+            time.sleep(0.1)

         self.dynamic_adjustment.exit_flag = True
         while self.agents:
diff --git a/NeuronalNetwork/ga3c/ThreadPredictor.py b/NeuronalNetwork/ga3c/ThreadPredictor.py
index a505452..5061ea6 100644
--- a/NeuronalNetwork/ga3c/ThreadPredictor.py
+++ b/NeuronalNetwork/ga3c/ThreadPredictor.py
@@ -43,19 +43,25 @@ def __init__(self, server, id):
     def run(self):
         ids = np.zeros(Config.PREDICTION_BATCH_SIZE, dtype=np.uint16)
         states = np.zeros(
-            (Config.PREDICTION_BATCH_SIZE, Config.OBSERVATION_SIZE, Config.STACKED_FRAMES),
+            (Config.PREDICTION_BATCH_SIZE, Config.OBSERVATION_SIZE, Config.OBSERVATION_SIZE, 2 * Config.STACKED_FRAMES),
+            dtype=np.float32)
+        rotations = np.zeros(
+            (Config.PREDICTION_BATCH_SIZE, Config.OBSERVATION_ROTATION_SIZE, Config.STACKED_FRAMES),
             dtype=np.float32)

         while not self.exit_flag:
-            ids[0], states[0] = self.server.prediction_q.get()
+            i, x = self.server.prediction_q.get()
+            ids[0], states[0], rotations[0] = i, x[0], x[1]

             size = 1
             while size < Config.PREDICTION_BATCH_SIZE and not self.server.prediction_q.empty():
-                ids[size], states[size] = self.server.prediction_q.get()
+                i, x = self.server.prediction_q.get()
+                ids[size], states[size], rotations[size] = i, x[0], x[1]
                 size += 1

-            batch = states[:size]
-            p, v = self.server.model.predict_p_and_v(batch)
+            state_batch = states[:size]
+            rotation_batch = rotations[:size]
+            p, v = self.server.model.predict_p_and_v([state_batch, rotation_batch])

             for i in range(size):
                 if ids[i] < len(self.server.agents):
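For reference, the batch the predictor now assembles matches the two placeholders in NetworkVP_Hsm; an illustrative shape check (a hypothetical batch of three observations):

import numpy as np

state_batch = np.zeros((3, 60, 60, 2), dtype=np.float32)   # input2D
rotation_batch = np.zeros((3, 16, 1), dtype=np.float32)    # rotation
# p, v = model.predict_p_and_v([state_batch, rotation_batch])
# p has shape (3, num_actions); v has shape (3,)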