diff --git a/.gitignore b/.gitignore index 54f011d6..1750c7e8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ /General_Deep_Q_RL.sln /General_Deep_Q_RL/theano.py /General_Deep_Q_RL/plot.png +.DS_Store diff --git a/deer/learning_algos/CRAR_pytorch.py b/deer/learning_algos/CRAR_pytorch.py new file mode 100644 index 00000000..6f7131f5 --- /dev/null +++ b/deer/learning_algos/CRAR_pytorch.py @@ -0,0 +1,630 @@ +""" +Code for the CRAR learning algorithm using Keras + +""" + +import numpy as np +np.set_printoptions(threshold=np.nan) +from keras.optimizers import SGD,RMSprop +from keras import backend as K +from ..base_classes import LearningAlgo +from .NN_CRAR_pytorch import NN # Default Neural network used +#import tensorflow as tf +#config = tf.ConfigProto() +#config.gpu_options.allow_growth=True +#sess = tf.Session(config=config) +import copy +import torch +import torch.nn.functional as F +import torch.optim as optim +import pdb + +def mean_squared_error_p(y_true, y_pred): + """ Modified mean square error that clips + """ + return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = modified mse error L_inf + #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = modified mse error L_2 + + +def mean_squared_error_p_pytorch(y_pred): + """ Modified mean square error that clips + """ + return torch.sum(torch.clamp( (torch.max((y_pred)**2,dim=-1)[0] - 1), 0., 100.)) # = modified mse error L_inf + +def exp_dec_error(y_true, y_pred): + return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) + + +def exp_dec_error_pytorch(y_pred): + return torch.mean(torch.exp( - 5.*torch.sqrt( torch.clamp(torch.sum(y_pred**2, dim=-1),0.000001,10) ) )) + + + +def cosine_proximity2(y_true, y_pred): + """ This loss is similar to the native cosine_proximity loss from Keras + but it differs by the fact that only the two first components of the two vectors are used + """ + y_true = K.l2_normalize(y_true[:,0:2], axis=-1) + y_pred = K.l2_normalize(y_pred[:,0:2], axis=-1) + return -K.sum(y_true * y_pred, axis=-1) + + +def cosine_proximity2_pytorch(y_true, y_pred): + """ This loss is similar to the native cosine_proximity loss from Keras + but it differs by the fact that only the two first components of the two vectors are used + """ + + y_true = F.normalize(y_true[:,0:2],p=2,dim=-1) + y_pred = F.normalize(y_pred[:,0:2],p=2,dim=-1) + return -torch.sum(y_true * y_pred, dim=-1) + + +# def loss_diff_s_s_(y_true, y_pred): +# return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. ) ) ) # tend to increase y_pred --> loss -1 + +class CRAR(LearningAlgo): + """ + Combined Reinforcement learning via Abstract Representations (CRAR) using Keras + + Parameters + ----------- + environment : object from class Environment + The environment in which the agent evolves. + rho : float + Parameter for rmsprop. Default : 0.9 + rms_epsilon : float + Parameter for rmsprop. Default : 0.0001 + momentum : float + Momentum for SGD. Default : 0 + clip_norm : float + The gradient tensor will be clipped to a maximum L2 norm given by this value. + freeze_interval : int + Period during which the target network is freezed and after which the target network is updated. Default : 1000 + batch_size : int + Number of tuples taken into account for each iteration of gradient descent. Default : 32 + update_rule: str + {sgd,rmsprop}. Default : rmsprop + random_state : numpy random number generator + Set the random seed. 
+ double_Q : bool, optional + Activate or not the double_Q learning. + More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. + neural_network : object, optional + Default is deer.learning_algos.NN_keras + """ + + def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): + """ Initialize the environment + + """ + LearningAlgo.__init__(self,environment, batch_size) + + self._rho = rho + self._rms_epsilon = rms_epsilon + self._momentum = momentum + self._clip_norm = clip_norm + self._update_rule = update_rule + self._freeze_interval = freeze_interval + self._double_Q = double_Q + self._random_state = random_state + self.update_counter = 0 + self._high_int_dim = kwargs.get('high_int_dim',False) + self._internal_dim = kwargs.get('internal_dim',2) + self.loss_interpret=0 + self.loss_T=0 + self.lossR=0 + self.loss_Q=0 + self.loss_disentangle_t=0 + self.loss_disambiguate1=0 + self.loss_disambiguate2=0 + self.loss_gamma=0 + + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) + + + self.encoder = self.learn_and_plan.encoder_model() + self.encoder_diff = self.learn_and_plan.encoder_diff_model + + + self.R = self.learn_and_plan.float_model() + self.Q = self.learn_and_plan.Q_model() + self.gamma = self.learn_and_plan.float_model() + self.transition = self.learn_and_plan.transition_model() + + self.full_Q=self.learn_and_plan.full_Q_model + + + # used to fit rewards + self.full_R = self.learn_and_plan.full_float_model + + + # used to fit gamma + self.full_gamma = self.learn_and_plan.full_float_model + + + # used to fit transitions + self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_ + + # constraint on consecutive t + self.diff_s_s_ = self.learn_and_plan.encoder_diff_model + + + # used to force features variations + if(self._high_int_dim==False): + self.force_features=self.learn_and_plan.force_features + + + self.all_models = [self.encoder,self.R,self.Q,self.gamma,self.transition] + + # Compile all models + self._compile() + + + # Instantiate the same neural network as a target network. + self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) + self.encoder_target = self.learn_and_plan_target.encoder_model() + self.Q_target = self.learn_and_plan_target.Q_model() + self.R_target = self.learn_and_plan_target.float_model() + self.gamma_target = self.learn_and_plan_target.float_model() + self.transition_target = self.learn_and_plan_target.transition_model() + + self.full_Q_target = self.learn_and_plan_target.full_Q_model + + + self.all_models_target = [self.encoder_target,self.R_target,self.Q_target,self.gamma_target,self.transition_target] + + self._resetQHat() + + + + def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): + """ + Train CRAR from one batch of data. + + Parameters + ----------- + states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 
+ actions_val : numpy array of integers with size [self._batch_size] + actions[i] is the action taken after having observed states[:][i]. + rewards_val : numpy array of floats with size [self._batch_size] + rewards[i] is the reward obtained for taking actions[i-1]. + next_states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + terminals_val : numpy array of booleans with size [self._batch_size] + terminals[i] is True if the transition leads to a terminal state and False otherwise + + Returns + ------- + Average loss of the batch training for the Q-values (RMSE) + Individual (square) losses for the Q-values for each tuple + """ + + onehot_actions = np.zeros((self._batch_size, self._n_actions)) + onehot_actions[np.arange(self._batch_size), actions_val] = 1 + onehot_actions_rand = np.zeros((self._batch_size, self._n_actions)) + onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 + states_val=list(states_val) + next_states_val=list(next_states_val) + + + states_val = torch.from_numpy(states_val[0]).float() + next_states_val = torch.from_numpy(next_states_val[0]).float() + onehot_actions = torch.from_numpy(onehot_actions).float() + terminals_val = torch.from_numpy(terminals_val[:,None].astype(np.int32)).float() + rewards_val = torch.from_numpy(rewards_val[:,None].astype(np.int32)).float() + + Es_=self.encoder.predict(next_states_val).detach() + Es=self.encoder.predict(states_val).detach() + ETs=self.transition.predict(torch.cat((Es,onehot_actions),-1)).detach() + R=self.R.predict(torch.cat((Es,onehot_actions),-1)).detach() + + if(self.update_counter%500==0): + print ("Printing a few elements useful for debugging:") + #print ("states_val[0][0]") + #print (states_val[0][0]) + #print ("next_states_val[0][0]") + #print (next_states_val[0][0]) + print ("actions_val[0], rewards_val[0], terminals_val[0]") + print (actions_val[0], rewards_val[0], terminals_val[0]) + print ("Es[0],ETs[0],Es_[0]") + + # if(Es.ndim==4): + # print (np.transpose(Es, (0, 3, 1, 2))[0],np.transpose(ETs, (0, 3, 1, 2))[0],np.transpose(Es_, (0, 3, 1, 2))[0]) # data_format='channels_last' --> 'channels_first' + # else: + print (Es[0],ETs[0],Es_[0]) + print ("R[0]") + print (R[0]) + + self.optimizer_diff_Tx_x_.zero_grad() + out = self.diff_Tx_x_(states_val,next_states_val,onehot_actions,(1-terminals_val),self.encoder,self.transition) + loss = torch.nn.MSELoss() + loss_val = loss(out,torch.zeros_like(Es)) + self.loss_T+= loss_val.data.numpy() + loss_val.backward() + for param in list(self.transition.parameters()): + param.grad.data.clamp_(-1, 1) + self.optimizer_diff_Tx_x_.step() + + + + self.optimizer_full_R.zero_grad() + out = self.full_R(states_val,onehot_actions,self.encoder,self.R) + loss = torch.nn.MSELoss() + loss_val = loss(out,rewards_val) + self.lossR+= loss_val.data.numpy() + loss_val.backward() + for param in list(self.encoder.parameters()) + list(self.R.parameters()): + param.grad.data.clamp_(-1, 1) + self.optimizer_full_R.step() + + + self.optimizer_full_gamma.zero_grad() + out = self.full_gamma(states_val,onehot_actions,self.encoder,self.gamma) + loss = torch.nn.MSELoss() + loss_val = loss(out,(1-terminals_val[:])*self._df) + self.loss_gamma+= loss_val.data.numpy() + loss_val.backward() + for param in list(self.encoder.parameters()) + list(self.gamma.parameters()): + param.grad.data.clamp_(-1, 1) + self.optimizer_full_gamma.step() + + + + 
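(Aside, not part of the patch.) The transition, reward and discount updates above, and the Q-learning update further below, all repeat the same seven-line pattern: zero the relevant optimizer, run the forward pass, take an MSE loss, backpropagate, clamp each gradient element-wise to [-1, 1] and step. As a minimal sketch only, with a hypothetical helper name `fit_mse_head`, the MSE-based updates could be factored like this:

```python
import torch
import torch.nn.functional as F

def fit_mse_head(optimizer, parameters, prediction, target):
    """One MSE gradient step with the same element-wise gradient clipping
    used throughout CRAR.train(). Returns the scalar loss for logging."""
    optimizer.zero_grad()
    loss_val = F.mse_loss(prediction, target)
    loss_val.backward()
    for param in parameters:
        if param.grad is not None:        # skip parameters untouched by this head
            param.grad.data.clamp_(-1, 1)
    optimizer.step()
    return loss_val.item()
```

With such a helper, the discount-factor update above would read, for example, `self.loss_gamma += fit_mse_head(self.optimizer_full_gamma, list(self.encoder.parameters()) + list(self.gamma.parameters()), out, (1 - terminals_val) * self._df)`.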
L_infinity ball of radius 1 loss + self.optimizer_encoder.zero_grad() + out = self.encoder(states_val) + loss_val = mean_squared_error_p_pytorch(out) + self.loss_disambiguate1+= loss_val.data.numpy() + loss_val.backward() + for param in list(self.encoder.parameters()): + param.grad.data.clamp_(-1, 1) + self.optimizer_encoder.step() + + + + + # This one is very important + # Entropy maximization loss (through exponential) between two random states + def roll(x, n): + return torch.cat((x[-n:], x[:-n])) + rolled = roll(states_val,-31) + self.optimizer_encoder_diff.zero_grad() + out = self.encoder_diff(self.encoder,states_val,rolled) + loss_val = exp_dec_error_pytorch(out) + + self.loss_disambiguate2+= loss_val.data.numpy() + loss_val.backward() + for param in list(self.encoder.parameters()): + param.grad.data.clamp_(-1, 1) + self.optimizer_encoder_diff.step() + + + + + # Not so much this one + # Entropy maximization loss (through exponential) between two consecutive states + self.optimizer_diff_s_s_.zero_grad() + out = self.diff_s_s_(self.encoder,states_val,next_states_val) + loss_val = exp_dec_error_pytorch(out) + self.loss_disentangle_t+= loss_val.data.numpy() + loss_val.backward() + for param in list(self.encoder.parameters()): + param.grad.data.clamp_(-1, 1) + self.optimizer_diff_s_s_.step() + + + # Q Learning loss + if self.update_counter % self._freeze_interval == 0: + self._resetQHat() + next_q_vals = self.full_Q_target(next_states_val,self.encoder_target,self.Q_target).detach() + max_next_q_vals=torch.max(next_q_vals, dim=1)[0] + not_terminals= (1 - terminals_val) + target = rewards_val + not_terminals * self._df * max_next_q_vals[:,None] + + self.optimizer_full_Q.zero_grad() + q_vals=self.full_Q(states_val,self.encoder,self.Q).gather(1, torch.from_numpy(actions_val.astype(int)[:,None])) + loss = torch.nn.MSELoss() + loss_val = loss(q_vals,target) + loss = loss_val.data.numpy() + self.loss_Q+= loss + loss_val.backward() + for param in list(self.encoder.parameters()) + list(self.Q.parameters()): + param.grad.data.clamp_(-1, 1) + self.optimizer_full_Q.step() + + + + + + + if(self.update_counter%500==0): + print ("self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.") + print (self.loss_T/500., self.lossR/500.,self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.) + + if(self._high_int_dim==False): + print ("self.loss_interpret/500.") + print (self.loss_interpret/500.) 
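+            # The seven running sums printed above are, in order: the transition-consistency
+            # loss (loss_T), reward loss (lossR), discount loss (loss_gamma), Q-learning
+            # loss (loss_Q), the entropy term between consecutive states (loss_disentangle_t),
+            # the L_inf contraction penalty (loss_disambiguate1) and the entropy term between
+            # random states (loss_disambiguate2); all are averaged over the last 500 updates
+            # and reset to zero just below. loss_interpret tracks the feature-forcing
+            # objective, whose optimizer is commented out in this port, so it stays at zero.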
+ + self.lossR=0 + self.loss_gamma=0 + self.loss_Q=0 + self.loss_T=0 + self.loss_interpret=0 + + self.loss_disentangle_t=0 + self.loss_disambiguate1=0 + self.loss_disambiguate2=0 + + + if(self.update_counter%100==0): + print ("Number of training steps:"+str(self.update_counter)+".") + + self.update_counter += 1 + + + + + return np.sqrt(loss),(q_vals.detach()-target)**2 + + + def _compile(self): + """ Compile all the optimizers for the different losses + """ + + + if (self._update_rule=="rmsprop"): + self.optimizer_full_Q=optim.RMSprop(list(self.encoder.parameters()) + list(self.Q.parameters()), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) + self.optimizer_diff_Tx_x_=optim.RMSprop( list(self.encoder.parameters()) +list(self.transition.parameters()), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) # Different optimizers for each network; + self.optimizer_full_R=optim.RMSprop(list(self.encoder.parameters()) + list(self.R.parameters()), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) # to possibly modify them separately + self.optimizer_full_gamma=optim.RMSprop(list(self.encoder.parameters()) + list(self.gamma.parameters()), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) + self.optimizer_encoder=optim.RMSprop(self.encoder.parameters(), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) + self.optimizer_encoder_diff=optim.RMSprop(self.encoder.parameters(), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) + self.optimizer_diff_s_s_=optim.RMSprop(self.encoder.parameters(), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) + # self.optimizer_force_features=optim.RMSprop(list(self.encoder.parameters()) + list(self.transition.parameters()), lr=self._lr, alpha=self._rho, eps=self._rms_epsilon) # This never gets updated + + else: + raise Exception('The update_rule '+self._update_rule+' is not implemented.') + + self.optimizers = [self.optimizer_full_Q,self.optimizer_diff_Tx_x_, + self.optimizer_full_R,self.optimizer_full_gamma, + self.optimizer_encoder,self.optimizer_encoder_diff, + self.optimizer_diff_s_s_ ] + + + def qValues(self, state_val): + """ Get the q values for one pseudo-state (without planning) + + Arguments + --------- + state_val : array of objects (or list of objects) + Each object is a numpy array that relates to one of the observations + with size [1 * history size * size of punctual observation (which is 2D,1D or scalar)]). + + Returns + ------- + The q values for the provided pseudo state + """ + copy_state=copy.deepcopy(state_val) #Required! + + return self.full_Q.predict([np.expand_dims(state,axis=0) for state in copy_state])[0] + + def qValues_planning(self, state_val, R, gamma, T, Q, d=5): + """ Get the average Q-values up to planning depth d for one pseudo-state. + + Arguments + --------- + state_val : array of objects (or list of objects) + Each object is a numpy array that relates to one of the observations + with size [1 * history size * size of punctual observation (which is 2D,1D or scalar)]). 
+ R : float_model + Model that fits the reward + gamma : float_model + Model that fits the discount factor + T : transition_model + Model that fits the transition between abstract representation + Q : Q_model + Model that fits the optimal Q-value + d : int + planning depth + + Returns + ------- + The average q values with planning depth up to d for the provided pseudo-state + """ + encoded_x = self.encoder.predict(state_val) + +# ## DEBUG PURPOSES +# print ( "self.full_Q.predict(state_val)[0]" ) +# print ( self.full_Q.predict(state_val)[0] ) +# identity_matrix = np.diag(np.ones(self._n_actions)) +# if(encoded_x.ndim==2): +# tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) +# elif(encoded_x.ndim==4): +# tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1,1,1)) +# else: +# print ("error") +# +# repeat_identity=np.repeat(identity_matrix,len(encoded_x),axis=0) +# ##print tile3_encoded_x +# ##print repeat_identity +# r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) +# #print "r_vals_d0" +# #print r_vals_d0 +# r_vals_d0=r_vals_d0.flatten() +# print "r_vals_d0" +# print r_vals_d0 +# next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) +# #print "next_x_predicted" +# #print next_x_predicted +# one_hot_first_action=np.zeros((1,self._n_actions)) +# one_hot_first_action[0]=1 +# next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) +# next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) +# next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) +# #print "next_x_predicted action 0 t4" +# #print next_x_predicted +# ## END DEBUG PURPOSES + + QD_plan=0 + for i in range(d+1): + Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=[self._n_actions,2,2,2,2,2,2,2]).reshape(len(encoded_x),-1) + print ("Qd,i") + print (Qd,i) + QD_plan+=Qd + QD_plan=QD_plan/(d+1) + + print ("QD_plan") + print (QD_plan) + + return QD_plan + + def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): + """ Get the q values for pseudo-state(s) with a planning depth d. + This function is called recursively by decreasing the depth d at every step. + + Arguments + --------- + state_abstr_val : internal state(s). + R : float_model + Model that fits the reward + gamma : float_model + Model that fits the discount factor + T : transition_model + Model that fits the transition between abstract representation + Q : Q_model + Model that fits the optimal Q-value + d : int + planning depth + + Returns + ------- + The Q-values with planning depth d for the provided encoded state(s) + """ + #if(branching_factor==None or branching_factor>self._n_actions): + # branching_factor=self._n_actions + + + + n=len(state_abstr_val) + identity_matrix = np.identity(self._n_actions) + + this_branching_factor=branching_factor.pop(0) + if (n==1): + # We require that the first branching factor is self._n_actions so that this function return values + # with the right dimension (=self._n_actions). 
+ this_branching_factor=self._n_actions + + if (d==0): + if(this_branching_factor1, it provides the possibility to consider a sequence of transitions between s1 and s2 + (input a is then a list of actions) + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + + + enc_s1 = encoder_model(s1) + enc_s2 = encoder_model(s2) + + Tx = transition_model(torch.cat((enc_s1,action),-1)) + + + return (Tx - enc_s2)*(not_terminal) + + def force_features(self,s1,s2,action,encoder_model,transition_model,plan_depth=0): + """ Instantiate a Keras model that provides the vector of the transition at E(s1). It is calculated as the different between E(s1) and E(T(s1)). + Used to force the directions of the transitions. + + The model takes the four following inputs: + s1 : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + a : list of ints with length (plan_depth+1) + the action(s) considered at s1 + + Parameters + ----------- + encoder_model: instantiation of a Keras model for the encoder (E) + transition_model: instantiation of a Keras model for the transition (T) + plan_depth: if>1, it provides the possibility to consider a sequence of transitions between s1 and s2 + (input a is then a list of actions) + + Returns + ------- + model with output E(s1)-T(E(s1)) + + """ + + + enc_s1 = encoder_model(s1) + enc_s2 = encoder_model(s2) + + Tx = transition_model(torch.cat((enc_s1,action),-1)) + + + return (Tx - enc_s2) + + + def float_model(self): + """ Instantiate a Keras model for fitting a float from x. + + The model takes the following inputs: + x : internal state + a : int + the action considered at x + + Parameters + ----------- + + Returns + ------- + model that outputs a float + + """ + + + class FloatModel(nn.Module): + def __init__(self,internal_dim,n_actions): + super(FloatModel, self).__init__() + self.lin1 = nn.Linear(internal_dim+n_actions, 10) + self.lin2 = nn.Linear(10, 50) + self.lin3 = nn.Linear(50, 20) + self.lin4 = nn.Linear(20, 1) + + def forward(self, x): + + x = torch.tanh(self.lin1(x)) + x = torch.tanh(self.lin2(x)) + x = torch.tanh(self.lin3(x)) + x = self.lin4(x) + return x + def predict(self, x): + return self.forward(x) + model = FloatModel(self.internal_dim,self._n_actions) + + + + return model + + def full_float_model(self,x,action,encoder_model,float_model,plan_depth=0,transition_model=None): + """ Instantiate a Keras model for fitting a float from s. + + The model takes the four following inputs: + s : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + a : list of ints with length (plan_depth+1) + the action(s) considered at s + + Parameters + ----------- + encoder_model: instantiation of a Keras model for the encoder (E) + float_model: instantiation of a Keras model for fitting a float from x + plan_depth: if>1, it provides the possibility to consider a sequence of transitions following s + (input a is then a list of actions) + transition_model: instantiation of a Keras model for the transition (T) + + Returns + ------- + model with output the reward r + """ + + + enc_x = encoder_model(x) + reward_pred = float_model(torch.cat((enc_x,action),-1)) + return reward_pred + + def Q_model(self): + """ Instantiate a a Keras model for the Q-network from x. 
+ + The model takes the following inputs: + x : internal state + + Parameters + ----------- + + Returns + ------- + model that outputs the Q-values for each action + """ + + + + class QFunction(nn.Module): + def __init__(self,internal_dim,n_actions): + super(QFunction, self).__init__() + self.lin1 = nn.Linear(internal_dim, 20) + self.lin2 = nn.Linear(20, 50) + self.lin3 = nn.Linear(50, 20) + self.lin4 = nn.Linear(20, n_actions) + + def forward(self, x): + x = torch.tanh(self.lin1(x)) + x = torch.tanh(self.lin2(x)) + x = torch.tanh(self.lin3(x)) + x = self.lin4(x) + return x + def predict(self, x): + return self.forward(x) + + model = QFunction(self.internal_dim,self._n_actions) + + + + + return model + + + def full_Q_model(self, x, encoder_model, Q_model, plan_depth=0, transition_model=None, R_model=None, discount_model=None): + """ Instantiate a a Keras model for the Q-network from s. + + The model takes the following inputs: + s : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + a : list of ints with length plan_depth; if plan_depth=0, there isn't any input for a. + the action(s) considered at s + + Parameters + ----------- + encoder_model: instantiation of a Keras model for the encoder (E) + Q_model: instantiation of a Keras model for the Q-network from x. + plan_depth: if>1, it provides the possibility to consider a sequence of transitions following s + (input a is then a list of actions) + transition_model: instantiation of a Keras model for the transition (T) + R_model: instantiation of a Keras model for the reward + discount_model: instantiation of a Keras model for the discount + + Returns + ------- + model with output the Q-values + """ + + out = encoder_model(x) + Q_estim= Q_model(out) + + return Q_estim + +if __name__ == '__main__': + pass + \ No newline at end of file diff --git a/examples/test_CRAR/run_simple_maze_pytorch.py b/examples/test_CRAR/run_simple_maze_pytorch.py new file mode 100644 index 00000000..2c242b43 --- /dev/null +++ b/examples/test_CRAR/run_simple_maze_pytorch.py @@ -0,0 +1,199 @@ +"""Simple maze launcher +""" + +import sys +import logging +import numpy as np +from joblib import hash, dump +import os +import pdb + +from deer.default_parser import process_args +from deer.agent import NeuralAgent +from deer.learning_algos.CRAR_pytorch import CRAR +from simple_maze_env_pytorch import MyEnv as simple_maze_env +import deer.experiment.base_controllers as bc + +from deer.policies import EpsilonGreedyPolicy + + +class Defaults: + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 5000 + EPOCHS = 50 + STEPS_PER_TEST = 1000 + PERIOD_BTW_SUMMARY_PERFS = 1 + + # ---------------------- + # Environment Parameters + # ---------------------- + FRAME_SKIP = 2 + + # ---------------------- + # DQN Agent parameters: + # ---------------------- + UPDATE_RULE = 'rmsprop' + LEARNING_RATE = 0.0005 + LEARNING_RATE_DECAY = 0.9 + DISCOUNT = 0.9 + DISCOUNT_INC = 1 + DISCOUNT_MAX = 0.99 + RMS_DECAY = 0.9 + RMS_EPSILON = 0.0001 + MOMENTUM = 0 + CLIP_NORM = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = 1.0 + EPSILON_DECAY = 10000 + UPDATE_FREQUENCY = 1 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + FREEZE_INTERVAL = 1000 + DETERMINISTIC = False + + +HIGHER_DIM_OBS = False + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # --- Parse parameters --- + parameters = 
process_args(sys.argv[1:], Defaults) + if parameters.deterministic: + rng = np.random.RandomState(123456) + else: + rng = np.random.RandomState() + + # --- Instantiate environment --- + env = simple_maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS) + + # --- Instantiate learning_algo --- + learning_algo = CRAR( + env, + parameters.rms_decay, + parameters.rms_epsilon, + parameters.momentum, + parameters.clip_norm, + parameters.freeze_interval, + parameters.batch_size, + parameters.update_rule, + rng, + high_int_dim=False, + internal_dim=2) + + test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.) + + # --- Instantiate agent --- + agent = NeuralAgent( + env, + learning_algo, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + test_policy=test_policy) + + # --- Create unique filename for FindBestController --- + h = hash(vars(parameters), hash_name="sha1") + fname = "test_" + h + print("The parameters hash is: {}".format(h)) + print("The parameters are: {}".format(parameters)) + + # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy + # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more + # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every + # episode or epoch (or never, hence the resetEvery='none'). + agent.attach(bc.EpsilonController( + initial_e=parameters.epsilon_start, + e_decays=parameters.epsilon_decay, + e_min=parameters.epsilon_min, + evaluate_on='action', + periodicity=1, + reset_every='none')) + + + agent.run(10, 500) + print("end gathering data") + + # --- Bind controllers to the agent --- + # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and + # learning rate as well as the training epoch number. + agent.attach(bc.VerboseController( + evaluate_on='epoch', + periodicity=1)) + + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we + # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. + agent.attach(bc.LearningRateController( + initial_learning_rate=parameters.learning_rate, + learning_rate_decay=parameters.learning_rate_decay, + periodicity=1)) + + # Same for the discount factor. + agent.attach(bc.DiscountFactorController( + initial_discount_factor=parameters.discount, + discount_factor_growth=parameters.discount_inc, + discount_factor_max=parameters.discount_max, + periodicity=1)) + + # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. + # Plus, we also want to display after each training episode (!= than after every training) the average bellman + # residual and the average of the V values obtained during the last episode, hence the two last arguments. + agent.attach(bc.TrainerController( + evaluate_on='action', + periodicity=parameters.update_frequency, + show_episode_avg_V_value=True, + show_avg_Bellman_residual=True)) + + # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one + # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the + # "true generalization score", or "test score". 
+ # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is + # important that the validationID is the same than the id argument of the InterleavedTestEpochController. + # The FindBestController will dump on disk the validation scores for each and every network, as well as the + # structure of the neural network having the best validation score. These dumps can then used to plot the evolution + # of the validation and test scores (see below) or simply recover the resulting neural network for your + # application. + agent.attach(bc.FindBestController( + validationID=simple_maze_env.VALIDATION_MODE, + testID=None, + unique_fname=fname)) + + # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a + # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want + # these validation epoch to interfere with the training of the agent, which is well established by the + # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole + # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the + # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards + # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every + # [parameters.period_btw_summary_perfs] *validation* epochs. + agent.attach(bc.InterleavedTestEpochController( + id=simple_maze_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4], + periodicity=2, + show_score=True, + summarize_every=1)) + + # --- Run the experiment --- + try: + os.mkdir("params") + except Exception: + pass + dump(vars(parameters), "params/" + fname + ".jldump") + agent.gathering_data=False + + + agent.run(parameters.epochs, parameters.steps_per_epoch) + + # --- Show results --- + basename = "scores/" + fname + scores = joblib.load(basename + "_scores.jldump") + plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') + plt.legend() + plt.xlabel("Number of epochs") + plt.ylabel("Score") + plt.savefig(basename + "_scores.pdf") + plt.show() \ No newline at end of file diff --git a/examples/test_CRAR/simple_maze_env_pytorch.py b/examples/test_CRAR/simple_maze_env_pytorch.py new file mode 100644 index 00000000..7602d31b --- /dev/null +++ b/examples/test_CRAR/simple_maze_env_pytorch.py @@ -0,0 +1,429 @@ +""" Simple maze environment + +""" +import numpy as np +import cv2 +import pdb +import torch + +from deer.base_classes import Environment + +import matplotlib +matplotlib.use('agg') +# matplotlib.use('qt5agg') +from mpl_toolkits.axes_grid1 import host_subplot +import mpl_toolkits.axisartist as AA +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.cm as cm +from matplotlib.patches import Circle, Rectangle +from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker +import copy + +class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self, rng, **kwargs): + + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + self._size_maze=8 + self._higher_dim_obs=kwargs["higher_dim_obs"] + self.create_map() + self.intern_dim=2 + + def create_map(self): + self._map=np.zeros((self._size_maze,self._size_maze)) + self._map[-1,:]=1 + self._map[0,:]=1 + self._map[:,0]=1 + 
self._map[:,-1]=1 + self._map[:,self._size_maze//2]=1 + self._map[self._size_maze//2,self._size_maze//2]=0 + self._pos_agent=[2,2] + self._pos_goal=[self._size_maze-2,self._size_maze-2] + + + def reset(self, mode): + self.create_map() + + self._map[self._size_maze//2,self._size_maze//2]=0 + + if mode == MyEnv.VALIDATION_MODE: + if self._mode != MyEnv.VALIDATION_MODE: + self._mode = MyEnv.VALIDATION_MODE + self._mode_score = 0.0 + self._mode_episode_count = 0 + + else: + self._mode_episode_count += 1 + elif self._mode != -1: + self._mode = -1 + + # Setting the starting position of the agent + self._pos_agent=[self._size_maze//2,self._size_maze//2] + + #print ("new map:") + #print (self._map) + #print ("reset mode") + #print (mode) + + return [1 * [self._size_maze * [self._size_maze * [0]]]] + + + def act(self, action): + """Applies the agent action [action] on the environment. + + Parameters + ----------- + action : int + The action selected by the agent to operate on the environment. Should be an identifier + included between 0 included and nActions() excluded. + """ + + self._cur_action=action + if(action==0): + if(self._map[self._pos_agent[0]-1,self._pos_agent[1]]==0): + self._pos_agent[0]=self._pos_agent[0]-1 + elif(action==1): + if(self._map[self._pos_agent[0]+1,self._pos_agent[1]]==0): + self._pos_agent[0]=self._pos_agent[0]+1 + elif(action==2): + if(self._map[self._pos_agent[0],self._pos_agent[1]-1]==0): + self._pos_agent[1]=self._pos_agent[1]-1 + elif(action==3): + if(self._map[self._pos_agent[0],self._pos_agent[1]+1]==0): + self._pos_agent[1]=self._pos_agent[1]+1 + + # There is no reward in this simple environment + self.reward = 0 + + self._mode_score += self.reward + return self.reward + + def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): + """ Plot of the low-dimensional representation of the environment built by the model + """ + + all_possib_inp=[] # Will store all possible inputs (=observation) for the CRAR agent + labels_maze=[] + self.create_map() + for y_a in range(self._size_maze): + for x_a in range(self._size_maze): + state=copy.deepcopy(self._map) + state[self._size_maze//2,self._size_maze//2]=0 + if(state[x_a,y_a]==0): + if(self._higher_dim_obs==True): + all_possib_inp.append(self.get_higher_dim_obs([[x_a,y_a]],[self._pos_goal])) + else: + state[x_a,y_a]=0.5 + all_possib_inp.append(state) + + ## labels + #if(y_a 'channels_first' + + n=1000 + historics=[] + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + historics.append(np.expand_dims(observ,axis=0)) + historics=np.array(historics) + + historics = torch.from_numpy(historics).float() + abs_states=learning_algo.encoder.predict(historics) + # if(abs_states.ndim==4): + # abs_states=np.transpose(abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' + + actions=test_data_set.actions()[0:n] + + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) + + + m = cm.ScalarMappable(cmap=cm.jet) + + + + abs_states = abs_states.detach().numpy() + all_possib_abs_states = all_possib_abs_states.detach().numpy() + + x = np.array(abs_states)[:,0] + y = np.array(abs_states)[:,1] + if(self.intern_dim>2): + z = np.array(abs_states)[:,2] + + fig = plt.figure() + if(self.intern_dim==2): + ax = fig.add_subplot(111) + ax.set_xlabel(r'$X_1$') + ax.set_ylabel(r'$X_2$') + else: + ax = 
fig.add_subplot(111,projection='3d') + ax.set_xlabel(r'$X_1$') + ax.set_ylabel(r'$X_2$') + ax.set_zlabel(r'$X_3$') + + # Plot the estimated transitions + for i in range(n-1): + # pdb.set_trace() + predicted1=learning_algo.transition.predict(torch.cat((torch.from_numpy(abs_states[i:i+1]).float() ,torch.from_numpy(np.array([[1,0,0,0]])).float()),-1)).detach().numpy() + predicted2=learning_algo.transition.predict(torch.cat((torch.from_numpy(abs_states[i:i+1]).float() ,torch.from_numpy(np.array([[0,1,0,0]])).float()),-1)).detach().numpy() + predicted3=learning_algo.transition.predict(torch.cat((torch.from_numpy(abs_states[i:i+1]).float() ,torch.from_numpy(np.array([[0,0,1,0]])).float()),-1)).detach().numpy() + predicted4=learning_algo.transition.predict(torch.cat((torch.from_numpy(abs_states[i:i+1]).float() ,torch.from_numpy(np.array([[0,0,0,1]])).float()),-1)).detach().numpy() + # predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0,0]])]) + # predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0,0]])]) + # predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1,0]])]) + # predicted4=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,0,1]])]) + if(self.intern_dim==2): + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.15", alpha=0.75) + else: + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.15", alpha=0.75) + + # Plot the dots at each time step depending on the action taken + length_block=[[0,18],[18,19],[19,31]] + for i in range(3): + colors=['blue','orange','green'] + if(self.intern_dim==2): + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100) + else: + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50) + + if(self.intern_dim==2): + axes_lims=[ax.get_xlim(),ax.get_ylim()] + else: + axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] + + # Plot the legend for transition estimates + box1b = TextArea(" Estimated transitions (action 0, 1, 2 and 3): ", textprops=dict(color="k")) + box2b = DrawingArea(90, 20, 0, 0) + el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) + el2b = 
Rectangle((25, 10), 15,2, fc="0.65", alpha=0.75) + el3b = Rectangle((45, 10), 15,2, fc="0.4", alpha=0.75) + el4b = Rectangle((65, 10), 15,2, fc="0.15", alpha=0.75) + box2b.add_artist(el1b) + box2b.add_artist(el2b) + box2b.add_artist(el3b) + box2b.add_artist(el4b) + + boxb = HPacker(children=[box1b, box2b], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=boxb, pad=0., + frameon=True, + bbox_to_anchor=(0., 0.98), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + + #plt.show() + plt.savefig('pytorch/fig_base'+str(learning_algo.update_counter)+'.pdf') + + +# # Plot the Q_vals +# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) +# #print "actions,C" +# #print actions +# #print c +# #c=np.max(c,axis=1) +# m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) +# m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) +# +# #plt.colorbar(m3) +# ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.RdYlGn +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. +# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# #plt.show() +# plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') +# +# +# # fig_visuV +# fig = plt.figure() +# ax = fig.add_subplot(111, projection='3d') +# +# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] +# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] +# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] +# +# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) +# c=np.max(c,axis=1) +# #print "c" +# #print c +# +# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) +# #plt.colorbar(m) +# fig.subplots_adjust(right=0.8) +# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.hot +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. 
+# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# #plt.show() +# plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') +# +# +# # fig_visuR +# fig = plt.figure() +# ax = fig.add_subplot(111, projection='3d') +# +# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] +# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] +# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] +# +# coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) +# repeat_nactions_coord=np.repeat(coords,self.nActions(),axis=0) +# identity_matrix = np.diag(np.ones(self.nActions())) +# tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) +# +# c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) +# c=np.max(np.reshape(c,(125,self.nActions())),axis=1) +# #print "c" +# #print c +# #mini=np.min(c) +# #maxi=np.max(c) +# +# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) +# #plt.colorbar(m) +# fig.subplots_adjust(right=0.8) +# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.hot +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. +# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# #plt.show() +# plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + + matplotlib.pyplot.close("all") # avoids memory leaks + + def inputDimensions(self): + if(self._higher_dim_obs==True): + return [(1,self._size_maze*6,self._size_maze*6)] + else: + return [(1,self._size_maze,self._size_maze)] + + def observationType(self, subject): + return np.float + + def nActions(self): + return 4 + + def observe(self): + obs=copy.deepcopy(self._map) + + obs[self._pos_agent[0],self._pos_agent[1]]=0.5 + if(self._higher_dim_obs==True): + "self._pos_agent" + self._pos_agent + obs=self.get_higher_dim_obs([self._pos_agent],[self._pos_goal]) + + return [obs] + + def get_higher_dim_obs(self,indices_agent,indices_reward): + """ Obtain the high-dimensional observation from indices of the agent position and the indices of the reward positions. + """ + obs=copy.deepcopy(self._map) + obs=obs/1. + obs=np.repeat(np.repeat(obs, 6, axis=0),6, axis=1) + # agent repr + agent_obs=np.zeros((6,6)) + agent_obs[0,2]=0.7 + agent_obs[1,0:5]=0.8 + agent_obs[2,1:4]=0.8 + agent_obs[3,1:4]=0.8 + agent_obs[4,1]=0.8 + agent_obs[4,3]=0.8 + agent_obs[5,0:2]=0.8 + agent_obs[5,3:5]=0.8 + + # reward repr + reward_obs=np.zeros((6,6)) + #reward_obs[:,1]=0.8 + #reward_obs[0,1:4]=0.7 + #reward_obs[1,3]=0.8 + #reward_obs[2,1:4]=0.7 + #reward_obs[4,2]=0.8 + #reward_obs[5,2:4]=0.8 + + for i in indices_reward: + obs[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs + + for i in indices_agent: + obs[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=agent_obs + + #plt.imshow(obs, cmap='gray_r') + #plt.show() + return obs + + + def inTerminalState(self): + # Uncomment the following lines to add some cases where the episode terminates. 
+ # This is used to show how the environment representation interpret cases where + # part of the environment could not be explored. +# if((self._pos_agent[0]<=1 and self._cur_action==0) ): +# return True + return False + + # If there is a goal, then terminates the environment when the goas is reached. + #if (self._pos_agent==self._pos_goal): + # return True + #else: + # return False + + + +if __name__ == "__main__": + pass
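For quick manual checks outside the full DeeR agent loop, the maze environment above can also be driven directly. The snippet below is an illustrative sketch rather than part of the patch; it assumes the file is importable as `simple_maze_env_pytorch` (e.g. when run from examples/test_CRAR/) and that its dependencies (deer, torch, cv2, matplotlib) are installed.

```python
import numpy as np
from simple_maze_env_pytorch import MyEnv

rng = np.random.RandomState(0)
env = MyEnv(rng, higher_dim_obs=False)     # low-dimensional 8x8 observations

env.reset(-1)                              # -1 is the ordinary (non-validation) mode
for step in range(5):
    action = rng.randint(env.nActions())   # one of the 4 moves
    reward = env.act(action)               # always 0: this maze is reward-free
    obs = env.observe()[0]                 # single observation of shape (8, 8)
    print(step, action, reward, obs.shape, env.inTerminalState())
```

This roughly mirrors what NeuralAgent does during the initial data-gathering call agent.run(10, 500) in run_simple_maze_pytorch.py, except that the agent additionally stores the observed transitions in its replay memory.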