diff --git a/.gitignore b/.gitignore index 0b68375a..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 80ede800..826b25e6 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,6 +19,8 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", + "Pendulum": ".pendulum.pendulum", + "Lander": ".mit_humanoid.lander", } config_dict = { @@ -26,10 +28,15 @@ "MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config", "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", + "PendulumCfg": ".pendulum.pendulum_config", + "PendulumSACCfg": ".pendulum.pendulum_SAC_config", + "LanderCfg": ".mit_humanoid.lander_config", + "PendulumPSDCfg": ".pendulum.pendulum_PSD_config", } runner_config_dict = { @@ -37,10 +44,15 @@ "MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACRunnerCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config", "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", + "PendulumRunnerCfg": ".pendulum.pendulum_config", + "PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config", + "LanderRunnerCfg": ".mit_humanoid.lander_config", + "PendulumPSDRunnerCfg": ".pendulum.pendulum_PSD_config", } task_dict = { @@ -56,14 +68,22 @@ "MiniCheetahOscCfg", "MiniCheetahOscRunnerCfg", ], + "sac_mini_cheetah": [ + "MiniCheetahRef", + "MiniCheetahSACCfg", + "MiniCheetahSACRunnerCfg" + ], "humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"], "humanoid_running": [ "HumanoidRunning", "HumanoidRunningCfg", "HumanoidRunningRunnerCfg", ], - "a1": ["A1", "A1Cfg", "A1RunnerCfg"], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], + "sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"], + "lander": ["Lander", "LanderCfg", "LanderRunnerCfg"], + "psd_pendulum": ["Pendulum", "PendulumPSDCfg", "PendulumPSDRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index cc19d8f3..87477b56 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -137,13 +137,13 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -154,7 +154,19 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - 
critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 + projected_gravity = 0.02 + + class critic: + hidden_dims = [256, 256, 256] + # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -165,16 +177,8 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 1.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 4ce7f214..d18f7cca 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -85,6 +85,7 @@ class push_robots: toggle = True interval_s = 1 max_push_vel_xy = 0.5 + push_box_dims = [0.2, 0.2, 0.2] class domain_rand(LeggedRobotCfg.domain_rand): randomize_base_mass = True @@ -120,14 +121,14 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -138,7 +139,19 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 # 0.027, 0.14, 0.37 + projected_gravity = 0.02 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [256, 256, 256] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -149,16 +162,8 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 # 0.027, 0.14, 0.37 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 3.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 diff --git a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py index badc3faa..85b0c42b 100644 --- a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py +++ b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py @@ -56,12 +56,15 @@ class domain_rand(AnymalCFlatCfg.domain_rand): class AnymalCRoughCCfgPPO(AnymalCFlatCfgPPO): - class policy(AnymalCFlatCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] + class actor(AnymalCFlatCfgPPO.actor): + hidden_dims = [128, 64, 32] critic_hidden_dims = [128, 64, 32] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + class critic(AnymalCFlatCfgPPO.critic): + pass + class algorithm(AnymalCFlatCfgPPO.algorithm): entropy_coef = 0.01 diff --git a/gym/envs/base/base_task.py b/gym/envs/base/base_task.py index 
551143a3..033e0eeb 100644 --- a/gym/envs/base/base_task.py +++ b/gym/envs/base/base_task.py @@ -18,33 +18,20 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): # * env device is GPU only if sim is on GPU and use_gpu_pipeline=True, # * otherwise returned tensors are copied to CPU by physX. if sim_device_type == "cuda" and sim_params.use_gpu_pipeline: - self.device = self.sim_device + device = self.sim_device else: - self.device = "cpu" + device = "cpu" # * graphics device for rendering, -1 for no rendering self.graphics_device_id = self.sim_device_id - self.num_envs = cfg.env.num_envs self.num_actuators = cfg.env.num_actuators # * optimization flags for pytorch JIT torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) - # allocate buffers - self.to_be_reset = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.terminated = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.episode_length_buf = torch.zeros( - self.num_envs, device=self.device, dtype=torch.long - ) - self.timed_out = torch.zeros( - self.num_envs, device=self.device, dtype=torch.bool - ) + super().__init__(num_envs=cfg.env.num_envs, device=device) # todo: read from config self.enable_viewer_sync = True diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 99c038d9..f623e5d1 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): self.reset() def step(self): - """Apply actions, simulate, call self.post_physics_step() - and pre_physics_step() - - Args: - actions (torch.Tensor): Tensor of shape - (num_envs, num_actions_per_env) - """ - self._reset_buffers() - self._pre_physics_step() - # * step physics and render each frame + self._pre_decimation_step() self._render() for _ in range(self.cfg.control.decimation): + self._pre_compute_torques() self.torques = self._compute_torques() - - if self.cfg.asset.disable_motors: - self.torques[:] = 0.0 - torques_to_gym_tensor = torch.zeros( - self.num_envs, self.num_dof, device=self.device - ) - - # todo encapsulate - next_torques_idx = 0 - for dof_idx in range(self.num_dof): - if self.cfg.control.actuated_joints_mask[dof_idx]: - torques_to_gym_tensor[:, dof_idx] = self.torques[ - :, next_torques_idx - ] - next_torques_idx += 1 - else: - torques_to_gym_tensor[:, dof_idx] = torch.zeros( - self.num_envs, device=self.device - ) - - self.gym.set_dof_actuation_force_tensor( - self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) - ) - self.gym.simulate(self.sim) - if self.device == "cpu": - self.gym.fetch_results(self.sim, True) - self.gym.refresh_dof_state_tensor(self.sim) - - self._post_physics_step() + self._post_compute_torques() + self._step_physx_sim() + self._post_physx_step() + self._post_decimation_step() self._check_terminations_and_timeouts() env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten() self._reset_idx(env_ids) - def _pre_physics_step(self): - pass + def _pre_decimation_step(self): + return None + + def _pre_compute_torques(self): + return None + + def _post_compute_torques(self): + if self.cfg.asset.disable_motors: + self.torques[:] = 0.0 - def _post_physics_step(self): + def _step_physx_sim(self): + next_torques_idx = 0 + torques_to_gym_tensor = torch.zeros( + self.num_envs, self.num_dof, device=self.device + ) + for dof_idx in range(self.num_dof): + if self.cfg.control.actuated_joints_mask[dof_idx]: + torques_to_gym_tensor[:, 
dof_idx] = self.torques[:, next_torques_idx] + next_torques_idx += 1 + else: + torques_to_gym_tensor[:, dof_idx] = torch.zeros( + self.num_envs, device=self.device + ) + self.gym.set_dof_actuation_force_tensor( + self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) + ) + self.gym.simulate(self.sim) + if self.device == "cpu": + self.gym.fetch_results(self.sim, True) + self.gym.refresh_dof_state_tensor(self.sim) + + def _post_physx_step(self): """ check terminations, compute observations and rewards """ self.gym.refresh_actor_root_state_tensor(self.sim) self.gym.refresh_net_contact_force_tensor(self.sim) + def _post_decimation_step(self): self.episode_length_buf += 1 self.common_step_counter += 1 @@ -188,10 +184,7 @@ def _process_dof_props(self, props, env_id): try: # ! this seems like a crime... self.torque_limits[i] = props["effort"][i].item() except: - print( - "WARNING: passive joints need to be listed after " - + "active joints in the URDF." - ) + print("WARNING: your system has unactuated joints") # * soft limits m = (self.dof_pos_limits[i, 0] + self.dof_pos_limits[i, 1]) / 2 r = self.dof_pos_limits[i, 1] - self.dof_pos_limits[i, 0] @@ -212,18 +205,6 @@ def _process_rigid_body_props(self, props, env_id): return props def _compute_torques(self): - """Compute torques from actions. - Actions can be interpreted as position or velocity targets given - to a PD controller, or directly as scaled torques. - [NOTE]: torques must have the same dimension as the number of DOFs, - even if some DOFs are not actuated. - - Args: - actions (torch.Tensor): Actions - - Returns: - [torch.Tensor]: Torques sent to the simulation - """ actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) @@ -415,10 +396,7 @@ def _init_buffers(self): self.default_act_pos = self.default_act_pos.unsqueeze(0) # * store indices of actuated joints self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) - # * check that init range highs and lows are consistent - # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - self.initialize_ranges_for_initial_conditions() + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( @@ -584,50 +562,42 @@ def _get_env_origins(self): # TODO: do without terrain # ------------ reward functions---------------- - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + return torch.exp(-torch.square(x) / self.cfg.reward_settings.tracking_sigma) + else: + return torch.exp(-torch.square(x) / sigma) def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return -torch.mean(torch.square(self.dof_vel), dim=1) def _reward_action_rate(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - self.dof_pos_history[:, nact : 2 * nact] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] - self.dof_pos_history[:, 2 * nact :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def 
_reward_action_rate2(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - 2 * self.dof_pos_history[:, nact : 2 * nact] - + self.dof_pos_history[:, 2 * nact :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] + - 2 * self.dof_pos_history[:, nact : 2 * nact] + + self.dof_pos_history[:, 2 * nact :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" - return -torch.sum( + return -torch.mean( 1.0 * ( torch.norm( @@ -649,11 +619,11 @@ def _reward_dof_pos_limits(self): max=0.0 ) # lower limit out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too close to the limit""" # * clip to max error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index a59cc9eb..60160b9c 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,35 +123,34 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] + normalize_obs = True + smooth_exploration = False + actions = ["tau_ff"] + disable_actions = False - critic_obs = [ + class noise: + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ "observation_x", "observation_y", "critic_obs_can_be_the_same_or_different_than_actor_obs", ] - actions = ["tau_ff"] - disable_actions = False - - class noise: - noise = 0.1 # implement as needed, also in your robot class - class rewards: class weights: torques = 0.0 @@ -165,24 +164,29 @@ class termination_weight: termination = 0.0 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 # per iteration max_iterations = 500 # number of policy updates diff --git 
a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index e02507d5..c80ebc85 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -110,12 +110,12 @@ def _post_decimation_step(self): self.base_height = self.root_states[:, 2:3] - n = self.num_actuators - self.dof_pos_history[:, 2 * n :] = self.dof_pos_history[:, n : 2 * n] - self.dof_pos_history[:, n : 2 * n] = self.dof_pos_history[:, :n] - self.dof_pos_history[:, :n] = self.dof_pos_target self.dof_pos_obs = self.dof_pos - self.default_dof_pos + self.dof_pos_history = self.dof_pos_history.roll(self.num_actuators) + # self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_obs + self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_target + env_ids = ( self.episode_length_buf % int(self.cfg.commands.resampling_time / self.dt) == 0 @@ -133,7 +133,10 @@ def _reset_idx(self, env_ids): self._reset_system(env_ids) self._resample_commands(env_ids) # * reset buffers - self.dof_pos_history[env_ids] = 0.0 + self.dof_pos_obs[env_ids] = self.dof_pos[env_ids] - self.default_dof_pos + # self.dof_pos_history[env_ids] = self.dof_pos_obs[env_ids].tile(3) + self.dof_pos_target[env_ids] = self.default_dof_pos + self.dof_pos_history[env_ids] = self.dof_pos_target[env_ids].tile(3) self.episode_length_buf[env_ids] = 0 def _initialize_sim(self): @@ -481,41 +484,26 @@ def _init_buffers(self): get_axis_params(-1.0, self.up_axis_idx), device=self.device ).repeat((self.num_envs, 1)) self.torques = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.p_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.d_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_vel_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.tau_ff = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_history = torch.zeros( - self.num_envs, - self.num_actuators * 3, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators * 3, dtype=torch.float, device=self.device ) self.commands = torch.zeros( self.num_envs, 3, dtype=torch.float, device=self.device @@ -532,6 +520,29 @@ def _init_buffers(self): self.num_envs, 1, dtype=torch.float, device=self.device ) + # # * get the body_name to body_index dict + # body_dict = self.gym.get_actor_rigid_body_dict( + # self.envs[0], self.actor_handles[0] + # ) + # # * extract a list of body_names where the index is the id number + # body_names = [ + # body_tuple[0] + # for body_tuple in sorted( + # body_dict.items(), key=lambda body_tuple: body_tuple[1] + # ) + # ] + # # * construct a list of id numbers corresponding to end_effectors + # self.end_effector_ids = [] + # for end_effector_name in self.cfg.asset.foot_collisionbox_names: + # self.end_effector_ids.extend( + # [ + # 
body_names.index(body_name) + # for body_name in body_names + # if end_effector_name in body_name + # ] + # ) + # # ---------------------------------------- + if self.cfg.terrain.measure_heights: self.height_points = self._init_height_points() self.measured_heights = 0 @@ -559,8 +570,8 @@ def _init_buffers(self): found = False for dof_name in self.cfg.control.stiffness.keys(): if dof_name in name: - self.p_gains[i] = self.cfg.control.stiffness[dof_name] - self.d_gains[i] = self.cfg.control.damping[dof_name] + self.p_gains[:, i] = self.cfg.control.stiffness[dof_name] + self.d_gains[:, i] = self.cfg.control.damping[dof_name] found = True if not found: self.p_gains[i] = 0.0 @@ -980,11 +991,12 @@ def _get_heights(self, env_ids=None): return heights.view(self.num_envs, -1) * self.terrain.cfg.vertical_scale - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + return torch.exp(-torch.square(x) / self.cfg.reward_settings.tracking_sigma) + else: + return torch.exp(-torch.square(x) / sigma) # ------------ reward functions---------------- @@ -994,11 +1006,11 @@ def _reward_lin_vel_z(self): def _reward_ang_vel_xy(self): """Penalize xy axes base angular velocity""" - return -torch.sum(torch.square(self.base_ang_vel[:, :2]), dim=1) + return -torch.mean(torch.square(self.base_ang_vel[:, :2]), dim=1) def _reward_orientation(self): """Penalize non flat base orientation""" - return -torch.sum(torch.square(self.projected_gravity[:, :2]), dim=1) + return -torch.mean(torch.square(self.projected_gravity[:, :2]), dim=1) def _reward_base_height(self): """Penalize base height away from target""" @@ -1009,37 +1021,29 @@ def _reward_base_height(self): def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return -torch.mean(torch.square(self.dof_vel), dim=1) def _reward_action_rate(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - self.dof_pos_history[:, n : 2 * n] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] - self.dof_pos_history[:, 2 * n :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - - 2 * self.dof_pos_history[:, n : 2 * n] - + self.dof_pos_history[:, 2 * n :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] + - 2 * self.dof_pos_history[:, n : 2 * n] + + self.dof_pos_history[:, 2 * n :] ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" @@ -1064,26 +1068,26 @@ def _reward_dof_pos_limits(self): # * lower limit out_of_limits = -(self.dof_pos - self.dof_pos_limits[:, 0]).clip(max=0.0) out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too close to the limit""" # * clip to max 
error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_torque_limits(self): """penalize torques too close to the limit""" limit = self.cfg.reward_settings.soft_torque_limit error = self.torques.abs() - self.torque_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_tracking_lin_vel(self): """Tracking of linear velocity commands (xy axes)""" error = torch.square(self.commands[:, :2] - self.base_lin_vel[:, :2]) error = torch.exp(-error / self.cfg.reward_settings.tracking_sigma) - return torch.sum(error, dim=1) + return torch.mean(error, dim=1) def _reward_tracking_ang_vel(self): """Tracking of angular velocity commands (yaw)""" @@ -1092,7 +1096,7 @@ def _reward_tracking_ang_vel(self): def _reward_feet_contact_forces(self): """penalize high contact forces""" - return -torch.sum( + return -torch.mean( ( torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) - self.cfg.reward_settings.max_contact_force diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..c38f3e33 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -163,7 +163,7 @@ class asset: file = "" # * name of the feet bodies, # * used to index body state and contact force tensors - foot_name = "None" + foot_name = "foot" penalize_contacts_on = [] terminate_after_contacts_on = [] end_effector_names = [] @@ -233,15 +233,15 @@ class LeggedRobotRunnerCfg(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", @@ -263,6 +263,17 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] + class reward: class weights: tracking_lin_vel = 0.0 @@ -283,25 +294,29 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 24 + algorithm_class_name = "PPO2" + num_steps_per_env = 24 # deprecate max_iterations = 1500 save_interval = 50 run_name = "" diff --git 
a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..4dfcb280 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -5,10 +5,15 @@ class TaskSkeleton: - def __init__(self, num_envs=1, max_episode_length=1.0, device="cpu"): + def __init__(self, num_envs=1, device="cpu"): self.num_envs = num_envs - self.max_episode_length = max_episode_length self.device = device + + self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool) + self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool) + self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long) + self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool) + return None def get_states(self, obs_list): @@ -45,31 +50,19 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] = False self.terminated[:] = False self.timed_out[:] = False - def compute_reward(self, reward_weights): - """Compute and return a torch tensor of rewards - reward_weights: dict with keys matching reward names, and values - matching weights - """ - reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) - for name, weight in reward_weights.items(): - reward += weight * self._eval_reward(name) - return reward - - def _eval_reward(self, name): - return eval("self._reward_" + name + "()") - def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] - self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length - self.to_be_reset = self.timed_out | self.terminated + # self.to_be_reset = self.timed_out | self.terminated def step(self, actions): raise NotImplementedError diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index d8a19292..c3489021 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -7,7 +7,7 @@ class CartpoleCfg(FixedRobotCfg): class env(FixedRobotCfg.env): num_envs = 4024 num_actuators = 1 # 1 for the cart force - episode_length_s = 20 + episode_length_s = 5.0 class terrain(FixedRobotCfg.terrain): pass @@ -35,7 +35,8 @@ class control(FixedRobotCfg.control): 1, # slider_to_cart 0, ] # cart_to_pole - + stiffness = {"slider_to_cart": 0.0} + damping = {"slider_to_cart": 0.0} ctrl_frequency = 250 desired_sim_frequency = 500 @@ -67,15 +68,14 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 - actor_hidden_dims = [num_units] * num_layers - critic_hidden_dims = [num_units] * num_layers + hidden_dims = [num_units] * num_layers activation = "elu" - actor_obs = [ + obs = [ "cart_obs", "pole_trig_obs", "dof_vel", @@ -83,8 +83,6 @@ class policy(FixedRobotCfgPPO.policy): "pole_vel_square", ] - critic_obs = actor_obs - actions = ["tau_ff"] class noise: @@ -94,6 +92,20 @@ class noise: pole_vel = 0.010 actuation = 0.00 + class critic: + num_layers = 2 + num_units = 32 + hidden_dims = [num_units] * num_layers + activation = "elu" + + obs = [ + "cart_obs", + "pole_trig_obs", + "dof_vel", + 
"cart_vel_square", + "pole_vel_square", + ] + class reward: class weights: pole_pos = 5 @@ -108,25 +120,12 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-3 - schedule = "adaptive" # could be adaptive, fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 0.2 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(FixedRobotCfgPPO.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 96 # per iteration + algorithm_class_name = "PPO2" + num_steps_per_env = 32 # per iteration max_iterations = 500 # number of policy updates # * logging diff --git a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py new file mode 100644 index 00000000..13348b69 --- /dev/null +++ b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py @@ -0,0 +1,183 @@ +from gym.envs.base.legged_robot_config import LeggedRobotRunnerCfg +from gym.envs.mini_cheetah.mini_cheetah_ref_config import MiniCheetahRefCfg + +BASE_HEIGHT_REF = 0.3 + + +class MiniCheetahSACCfg(MiniCheetahRefCfg): + class env(MiniCheetahRefCfg.env): + num_envs = 1 + episode_length_s = 4 # TODO + + class terrain(MiniCheetahRefCfg.terrain): + pass + + class init_state(MiniCheetahRefCfg.init_state): + pass + + class control(MiniCheetahRefCfg.control): + # * PD Drive parameters: + stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} + damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} + gait_freq = 3.0 + ctrl_frequency = 20 # TODO + desired_sim_frequency = 100 + + class commands(MiniCheetahRefCfg.commands): + pass + + class push_robots(MiniCheetahRefCfg.push_robots): + pass + + class domain_rand(MiniCheetahRefCfg.domain_rand): + pass + + class asset(MiniCheetahRefCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + "mini_cheetah/urdf/mini_cheetah_simple.urdf" + ) + foot_name = "foot" + penalize_contacts_on = ["shank"] + terminate_after_contacts_on = ["base"] + end_effector_names = ["foot"] + collapse_fixed_joints = False + self_collisions = 1 + flip_visual_attachments = False + disable_gravity = False + disable_motors = False + joint_damping = 0.1 + rotor_inertia = [0.002268, 0.002268, 0.005484] * 4 + + class reward_settings(MiniCheetahRefCfg.reward_settings): + soft_dof_pos_limit = 0.9 + soft_dof_vel_limit = 0.9 + soft_torque_limit = 0.9 + max_contact_force = 600.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + class scaling(MiniCheetahRefCfg.scaling): + base_ang_vel = 0.3 + base_lin_vel = BASE_HEIGHT_REF + dof_vel = 4 * [2.0, 2.0, 4.0] + base_height = 0.3 + dof_pos = 4 * [0.2, 0.3, 0.3] + dof_pos_obs = dof_pos + dof_pos_target = 4 * [0.2, 0.3, 0.3] + tau_ff = 4 * [18, 18, 28] + commands = [3, 1, 3] + + +class MiniCheetahSACRunnerCfg(LeggedRobotRunnerCfg): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = { + "latent": [128, 128], + "mean": [64], + "std": [64], + } + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = { + "latent": "elu", + "mean": "elu", + "std": "elu", + } + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + ] + + actions = 
["dof_pos_target"] + add_noise = False # TODO + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [128, 128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "dof_pos_target", + ] + + class reward: + class weights: + tracking_lin_vel = 4.0 + tracking_ang_vel = 2.0 + lin_vel_z = 0.0 + ang_vel_xy = 0.01 + orientation = 1.0 + torques = 5.0e-7 + dof_vel = 0.0 + min_base_height = 1.5 + collision = 0.0 + action_rate = 0.01 + action_rate2 = 0.001 + stand_still = 0.0 + dof_pos_limits = 0.0 + feet_contact_forces = 0.0 + dof_near_home = 0.0 + reference_traj = 1.5 + swing_grf = 1.5 + stance_grf = 1.5 + + class termination_weight: + termination = 0.15 + + class algorithm(LeggedRobotRunnerCfg.algorithm): + # Taken from SAC pendulum + initial_fill = 500 + storage_size = 10**6 + batch_size = 256 + max_gradient_steps = 1 # 10 + action_max = 1.0 # TODO + action_min = -1.0 # TODO + actor_noise_std = 0.5 # TODO + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -12.0 # -action_dim + max_grad_norm = 1.0 + polyak = 0.995 # flipped compared to SB3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 1e-4 + critic_lr = 1e-4 + + class runner(LeggedRobotRunnerCfg.runner): + run_name = "" + experiment_name = "sac_mini_cheetah" + max_iterations = 50_000 + algorithm_class_name = "SAC" + save_interval = 10_000 + num_steps_per_env = 1 diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..ba2655f7 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -124,25 +124,13 @@ class scaling(LeggedRobotCfg.scaling): class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [128, 64] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - - actor_obs = [ - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "dof_pos_target", - ] - critic_obs = [ - "base_height", - "base_lin_vel", + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -150,9 +138,10 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_target", ] - + normalize_obs = False actions = ["dof_pos_target"] add_noise = True + disable_actions = False class noise: scale = 1.0 @@ -164,8 +153,23 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_target", + ] + + class reward: + class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 @@ 
-185,24 +189,11 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" max_iterations = 500 - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc.py b/gym/envs/mini_cheetah/mini_cheetah_osc.py index acec0b8a..6d44e35d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc.py @@ -16,6 +16,9 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + + self._switch = torch.zeros(self.num_envs, 1, device=self.device) + self.oscillators = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_obs = torch.zeros(self.num_envs, 8, device=self.device) @@ -158,6 +161,7 @@ def _post_decimation_step(self): """Update all states that are not handled in PhysX""" super()._post_decimation_step() self.grf = self._compute_grf() + self._update_cmd_switch() # self._step_oscillators() def _post_physx_step(self): @@ -266,12 +270,16 @@ def _compute_grf(self, grf_norm=True): else: return grf - def _switch(self): + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( + self._switch = torch.exp( -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / self.cfg.reward_settings.switch_scale ) + # return torch.exp( + # -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + # / self.cfg.reward_settings.switch_scale + # ) def _reward_cursorial(self): # penalize the abad joints being away from 0 @@ -357,10 +365,10 @@ def _reward_stance_velocity(self): return -torch.mean(rew, dim=1) def _reward_dof_vel(self): - return super()._reward_dof_vel() * self._switch() + return super()._reward_dof_vel() * self._switch def _reward_dof_near_home(self): - return super()._reward_dof_near_home() * self._switch() + return super()._reward_dof_near_home() * self._switch def _reward_stand_still(self): """Penalize motion at zero commands""" @@ -371,11 +379,11 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_standing_torques(self): """Penalize torques at zero commands""" - return super()._reward_torques() * self._switch() + return super()._reward_torques() * self._switch # * gait similarity scores def angle_difference(self, theta1, theta2): diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..34996404 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -64,7 +64,7 @@ class control(MiniCheetahCfg.control): # * PD Drive parameters: stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 
0.5, "hfe": 0.5, "kfe": 0.5} - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class osc: @@ -150,7 +150,7 @@ class reward_settings(MiniCheetahCfg.reward_settings): soft_dof_vel_limit = 0.9 soft_torque_limit = 0.9 max_contact_force = 600.0 - base_height_target = BASE_HEIGHT_REF + 0.03 + base_height_target = 0.3 tracking_sigma = 0.25 switch_scale = 0.5 @@ -160,14 +160,16 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - - actor_obs = [ + smooth_exploration = False + exploration_sample_freq = 16 + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -176,8 +178,26 @@ class policy(MiniCheetahRunnerCfg.policy): "oscillator_obs", "dof_pos_target", ] + normalize_obs = False + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 - critic_obs = [ + class critic(MiniCheetahRunnerCfg.critic): + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -189,11 +209,7 @@ class policy(MiniCheetahRunnerCfg.policy): "oscillators_vel", "dof_pos_target", ] - - actions = ["dof_pos_target"] - - class noise: - pass + normalize_obs = True class reward: class weights: @@ -202,12 +218,12 @@ class weights: lin_vel_z = 0.0 ang_vel_xy = 0.01 orientation = 1.0 - torques = 5.0e-7 + torques = 5.0e-6 dof_vel = 0.0 min_base_height = 1.5 collision = 0 - action_rate = 0.01 # -0.01 - action_rate2 = 0.001 # -0.001 + action_rate = 0.001 # -0.01 + action_rate2 = 0.0001 # -0.001 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 @@ -222,27 +238,33 @@ class weights: standing_torques = 0.0 # 1.e-5 class termination_weight: - termination = 15.0 / 100.0 + termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.98 + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive, fixed - discount_horizon = 1.0 - GAE_bootstrap_horizon = 2.0 - desired_kl = 0.01 + learning_rate = 1.0e-3 max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "FullSend" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index 57e1f0e7..0084a80c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -17,6 +17,7 @@ def 
__init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self._switch = torch.zeros(self.num_envs, 1, device=self.device) self.phase = torch.zeros( self.num_envs, 1, dtype=torch.float, device=self.device ) @@ -41,6 +42,7 @@ def _post_decimation_step(self): self.phase_obs = torch.cat( (torch.sin(self.phase), torch.cos(self.phase)), dim=1 ) + self._update_cmd_switch() def _resample_commands(self, env_ids): super()._resample_commands(env_ids) @@ -50,10 +52,20 @@ def _resample_commands(self, env_ids): ).squeeze(1) self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1) - def _switch(self): + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + + # --- + + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + self._switch = torch.exp( + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + / self.cfg.reward_settings.switch_scale ) def _reward_swing_grf(self): @@ -64,7 +76,7 @@ def _reward_swing_grf(self): ) ph_off = torch.lt(self.phase, torch.pi) rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_stance_grf(self): """Reward non-zero grf during stance (pi to 2pi)""" @@ -75,7 +87,7 @@ def _reward_stance_grf(self): ph_off = torch.gt(self.phase, torch.pi) # should this be in swing? rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_reference_traj(self): """REWARDS EACH LEG INDIVIDUALLY BASED ON ITS POSITION IN THE CYCLE""" @@ -84,7 +96,7 @@ def _reward_reference_traj(self): error /= self.scales["dof_pos"] reward = (self._sqrdexp(error) - torch.abs(error) * 0.2).mean(dim=1) # * only when commanded velocity is higher - return reward * (1 - self._switch()) + return reward * (1 - self._switch) def _get_ref(self): leg_frame = torch.zeros_like(self.torques) @@ -112,10 +124,10 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_tracking_lin_vel(self): """Tracking linear velocity commands (xy axes)""" # just use lin_vel? 
reward = super()._reward_tracking_lin_vel() - return reward * (1 - self._switch()) + return reward * (1 - self._switch) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..37c52919 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -26,7 +26,7 @@ class control(MiniCheetahCfg.control): stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} gait_freq = 3.0 - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class commands(MiniCheetahCfg.commands): @@ -60,6 +60,7 @@ class reward_settings(MiniCheetahCfg.reward_settings): max_contact_force = 600.0 base_height_target = 0.3 tracking_sigma = 0.25 + switch_scale = 0.1 class scaling(MiniCheetahCfg.scaling): pass @@ -67,15 +68,17 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + layer_norm = [True, True, False] activation = "elu" - - actor_obs = [ + smooth_exploration = False + exploration_sample_freq = 16 + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -83,8 +86,29 @@ class policy(MiniCheetahRunnerCfg.policy): "dof_vel", "phase_obs", ] + normalize_obs = False + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic(MiniCheetahRunnerCfg.critic): + hidden_dims = [256, 256, 128] + layer_norm = [True, True, False] + dropouts = [0.1, 0.0, 0.0] - critic_obs = [ + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -95,12 +119,7 @@ class policy(MiniCheetahRunnerCfg.policy): "phase_obs", "dof_pos_target", ] - - actions = ["dof_pos_target"] - disable_actions = False - - class noise: - pass + normalize_obs = False class reward: class weights: @@ -109,49 +128,46 @@ class weights: lin_vel_z = 0.0 ang_vel_xy = 0.01 orientation = 1.0 - torques = 5.0e-7 + torques = 5.0e-8 dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1e-5 + action_rate2 = 1e-6 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 - reference_traj = 1.5 + reference_traj = 0.0 swing_grf = 1.5 stance_grf = 1.5 - class pbrs_weights: - reference_traj = 0.0 - swing_grf = 0.0 - stance_grf = 0.0 - min_base_height = 0.0 - orientation = 0.0 - class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2 * 4096 # use all the data + max_gradient_steps = 50 + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - learning_rate = 5.0e-5 - schedule = "adaptive" # can be 
adaptive, fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 1.0 # [s] + schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" - num_steps_per_env = 32 + algorithm_class_name = "PPO2" + num_steps_per_env = 20 # deprecate diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 86a113f4..5ad7f3b5 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -142,7 +142,7 @@ class asset(LeggedRobotCfg.asset): # +'humanoid_fixed_arms_full.urdf') file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_F_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) keypoints = ["base"] end_effectors = ["left_foot", "right_foot"] @@ -165,7 +165,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False - disable_actions = False disable_motors = False # (1: disable, 0: enable...bitwise filter) @@ -201,15 +200,14 @@ class HumanoidRunningRunnerCfg(LeggedRobotRunnerCfg): do_wandb = True seed = -1 - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + hidden_dims = [256, 256, 256] # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) activation = "elu" normalize_obs = True # True, False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -221,10 +219,8 @@ class policy(LeggedRobotRunnerCfg.policy): "in_contact", ] - critic_obs = actor_obs - actions = ["dof_pos_target_legs"] - + disable_actions = False add_noise = True noise_level = 1.0 # scales other values @@ -237,6 +233,24 @@ class noise: dof_vel = 0.01 in_contact = 0.1 + class critic: + hidden_dims = [256, 256, 256] + # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) + activation = "elu" + normalize_obs = True # True, False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "phase_obs", + "dof_pos_legs", + "dof_vel_legs", + "in_contact", + ] + class reward: class weights: # * Behavioral rewards * # @@ -259,26 +273,14 @@ class termination_weight: termination = 1 class algorithm(LeggedRobotRunnerCfg.algorithm): - # algorithm training hyperparameters - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - num_mini_batches = 4 # minibatch size = num_envs*nsteps/nminibatches - learning_rate = 1.0e-5 - schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 - lam = 0.99 - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 24 + algorithm_class_name = "PPO2" + num_steps_per_env = 32 max_iterations = 1000 - run_name = "ICRA2023" + run_name = "HumanoidRunning" experiment_name = "HumanoidLocomotion" save_interval = 50 plot_input_gradients = False diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py new file mode 100644 index 00000000..86d37321 --- /dev/null +++ b/gym/envs/mit_humanoid/lander.py @@ -0,0 +1,127 @@ +import torch + +# from gym.envs.base.legged_robot import LeggedRobot +from 
gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid +from isaacgym.torch_utils import torch_rand_float + + +class Lander(MIT_Humanoid): + def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): + super().__init__(gym, sim, cfg, sim_params, sim_device, headless) + + def _resample_commands(self, env_ids): + """Randomly select commands of some environments + + Args: + env_ids (List[int]): Environment ids for which new commands are needed + """ + if len(env_ids) == 0: + return + super()._resample_commands(env_ids) + # * with 75% chance, reset to 0 + self.commands[env_ids, :] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.25 + ).unsqueeze(1) + + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) + + def _reward_lin_vel_z(self): + # Penalize z axis base linear velocity w. squared exp + return self._sqrdexp(self.base_lin_vel[:, 2] / self.scales["base_lin_vel"]) + + def _reward_orientation(self): + # Penalize non flat base orientation + return torch.sum( + self._sqrdexp(torch.square(self.projected_gravity[:, :2])), dim=1 + ) + + def _reward_min_base_height(self): + """Squared exponential saturating at base_height target""" + error = self.base_height - self.cfg.reward_settings.base_height_target + error = torch.clamp(error, max=0, min=None).flatten() + return self._sqrdexp(error) + + def _reward_tracking_lin_vel(self): + """Tracking of linear velocity commands (xy axes)""" + # just use lin_vel? + error = self.commands[:, :2] - self.base_lin_vel[:, :2] + # * scale by (1+|cmd|): if cmd=0, no scaling.
+ error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) + return torch.mean(self._sqrdexp(error), dim=1) + + def _reward_dof_vel(self): + # Penalize dof velocities + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") + + def _reward_dof_near_home(self): + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf + + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.cat( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) + ) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) + + def _reward_power(self): + power = self.torques * self.dof_vel + return power.pow(2).mean(dim=1) diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py new file mode 100644 index 00000000..86f3329f --- /dev/null +++ b/gym/envs/mit_humanoid/lander_config.py @@ -0,0 +1,328 @@ +from gym.envs.base.legged_robot_config import ( + LeggedRobotCfg, + LeggedRobotRunnerCfg, +) + +BASE_HEIGHT_REF = 0.80 + + +class LanderCfg(LeggedRobotCfg): + class env(LeggedRobotCfg.env): + num_envs = 4096 + num_actuators = 18 + episode_length_s = 10 # episode length in seconds + + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] + + class terrain(LeggedRobotCfg.terrain): + pass + + class init_state(LeggedRobotCfg.init_state): + # * default setup chooses how the initial conditions are chosen. + # * "reset_to_basic" = a single position with added randomized noise. + # * "reset_to_range" = a range of joint positions and velocities. + # * "reset_to_traj" = feed in a trajectory to sample from. 
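+ # * "reset_to_range" (used here) draws the initial joint and base states from the
+ # * dof_pos_range, dof_vel_range, root_pos_range and root_vel_range entries below.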
+ reset_mode = "reset_to_range" + + default_joint_angles = { + "hip_yaw": 0.0, + "hip_abad": 0.0, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, + "shoulder_pitch": 0.0, + "shoulder_abad": 0.0, + "shoulder_yaw": 0.0, + "elbow": -1.25, + } + + # * default COM for basic initialization + pos = [0.0, 0.0, 0.6] # x,y,z [m] + rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] + lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] + ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] + + # * initialization for random range setup + + dof_pos_range = { + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0, 0], + } + dof_vel_range = { + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0.0, 0.0], + } + + root_pos_range = [ + [0.0, 0.0], # x + [0.0, 0.0], # y + [0.64, 1.5], # z + [-0.1, 0.1], # roll + [-0.1, 0.1], # pitch + [-0.1, 0.1], + ] # yaw + + root_vel_range = [ + [-0.75, 2.75], # x + [-0.55, 0.55], # y + [-2.5, 0.25], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] + + class control(LeggedRobotCfg.control): + # * PD Drive parameters: + stiffness = { + "hip_yaw": 30.0, + "hip_abad": 30.0, + "hip_pitch": 30.0, + "knee": 30.0, + "ankle": 30.0, + "shoulder_pitch": 40.0, + "shoulder_abad": 40.0, + "shoulder_yaw": 40.0, + "elbow": 50.0, + } # [N*m/rad] + damping = { + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, + } # [N*m*s/rad] + + ctrl_frequency = 500 + desired_sim_frequency = 500 + + # class oscillator: + # base_frequency = 3.0 # [Hz] + + class commands: + resampling_time = 10.0 # time before command are changed[s] + + class ranges: + lin_vel_x = [-2.0, 2.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.3 # max [m/s] + yaw_vel = 1.0 # max [rad/s] + + class push_robots: + toggle = True + interval_s = 2 + max_push_vel_xy = 0.5 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] + + class domain_rand: + randomize_friction = True + friction_range = [0.5, 1.25] + randomize_base_mass = True + added_mass_range = [-1.0, 1.0] + + class asset(LeggedRobotCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" + ) + # foot_collisionbox_names = ["foot"] + foot_name = "foot" + penalize_contacts_on = ["arm", "hand", "shoulder"] + terminate_after_contacts_on = ["base"] + flip_visual_attachments = False + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter + collapse_fixed_joints = False + fix_base_link = False + disable_gravity = False + disable_motors = False + total_mass = 25.0 + + class reward_settings(LeggedRobotCfg.reward_settings): + soft_dof_pos_limit = 0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 + max_contact_force = 1500.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + # a smooth switch based on |cmd| (commanded velocity). 
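+ # * _switch() evaluates exp(-max(0, |cmd| - switch_scale)^2 / switch_scale): it stays
+ # * near 1 ("stand") while |cmd| <= switch_scale and decays towards 0 ("move") above.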
+ switch_scale = 0.5 + switch_threshold = 0.2 + + class scaling(LeggedRobotCfg.scaling): + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + ] + # # * Action scales + dof_pos_target = dof_pos + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos + + +class LanderRunnerCfg(LeggedRobotRunnerCfg): + seed = -1 + runner_class_name = "MyRunner" + + class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 + init_noise_std = 1.0 + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = ["elu", "elu", "tanh"] + layer_norm = True + smooth_exploration = False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", + ] + normalize_obs = False + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + layer_norm = True + + obs = [ + # "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", + ] + normalize_obs = False + + class reward: + class weights: + torques = 5.0e-5 + power = 1e-6 # 1.0e-2 + min_base_height = 1.5 + lin_vel_xy = 1.0 + action_rate = 1e-2 + action_rate2 = 1e-3 + lin_vel_z = 0.0 + ang_vel_xy = 0.0 + dof_vel = 0.5 + dof_pos_limits = 0.25 + dof_near_home = 0.75 + hips_forward = 0.0 + collision = 1.0 + + class termination_weight: + termination = 1.0 + + class algorithm(LeggedRobotRunnerCfg.algorithm): + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 + + class runner(LeggedRobotRunnerCfg.runner): + policy_class_name = "ActorCritic" + algorithm_class_name = "PPO2" + num_steps_per_env = 24 + max_iterations = 500 + run_name = "lander" + experiment_name = "Humanoid" + save_interval = 50 diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 74e35d5a..a2e082e8 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -1,7 +1,6 @@ import torch from gym.envs.base.legged_robot import LeggedRobot -from .jacobian import _apply_coupling class MIT_Humanoid(LeggedRobot): @@ -10,25 +9,148 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self._init_sampled_history_buffers() + def _init_sampled_history_buffers(self): + self.sampled_history_dof_pos_target = torch.zeros( + 
(self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), + device=self.device, + ) + self.sampled_history_dof_pos = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_dof_vel = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_counter = torch.zeros( + self.num_envs, dtype=int, device=self.device + ) + self.sampled_history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency + ) + self.J = torch.eye(self.num_dof).to(self.device) + self.J[4, 3] = 1 + self.J[9, 8] = 1 + self.J_inv_T = torch.inverse(self.J.T) + + def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): + # Create a Jacobian matrix and move it to the same device as input tensors + + # Perform transformations using Jacobian + q = torch.matmul(q, self.J.T) + qd = torch.matmul(qd, self.J.T) + q_des = torch.matmul(q_des, self.J.T) + qd_des = torch.matmul(qd_des, self.J.T) + + # Compute feed-forward torques + tau_ff = torch.matmul(self.J_inv_T, tau_ff.T).T + + # Compute kp and kd terms + kp = torch.diagonal( + torch.matmul( + torch.matmul(self.J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), + self.J_inv_T.T, + ), + dim1=-2, + dim2=-1, + ) + + kd = torch.diagonal( + torch.matmul( + torch.matmul(self.J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), + self.J_inv_T.T, + ), + dim1=-2, + dim2=-1, + ) + + # Compute torques + torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff + torques = torch.matmul(torques, self.J) + + return torques + + def _reset_system(self, env_ids): + if len(env_ids) == 0: + return + super()._reset_system(env_ids) + self._reset_sampled_history_buffers(env_ids) + return + + def _reset_sampled_history_buffers(self, ids): + n = self.cfg.env.sampled_history_length + self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) + + # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): - self.desired_pos_target = self.dof_pos_target + self.default_dof_pos - q = self.dof_pos.clone() - qd = self.dof_vel.clone() - q_des = self.desired_pos_target.clone() - qd_des = self.dof_vel_target.clone() - tau_ff = self.tau_ff.clone() - kp = self.p_gains.clone() - kd = self.d_gains.clone() - - if self.cfg.asset.apply_humanoid_jacobian: - torques = _apply_coupling(q, qd, q_des, qd_des, kp, kd, tau_ff) - else: - torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff + torques = self._apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ) + return torques.clip(-self.torque_limits, self.torque_limits) + + def _post_decimation_step(self): + super()._post_decimation_step() + self._update_sampled_history_buffers() + + def _update_sampled_history_buffers(self): + self.sampled_history_counter += 1 + + ids = torch.nonzero( + self.sampled_history_counter == self.sampled_history_threshold, + as_tuple=False, + ).flatten() + + self.sampled_history_dof_pos_target[ids] = torch.roll( + self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ + ids + ] + self.sampled_history_dof_pos[ids] = torch.roll( + 
self.sampled_history_dof_pos[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_vel[ids] = torch.roll( + self.sampled_history_dof_vel[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] - torques = torch.clip(torques, -self.torque_limits, self.torque_limits) + self.sampled_history_counter[ids] = 0 - return torques.view(self.torques.shape) + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) def _reward_lin_vel_z(self): # Penalize z axis base linear velocity w. squared exp @@ -56,12 +178,44 @@ def _reward_tracking_lin_vel(self): def _reward_dof_vel(self): # Penalize dof velocities - return torch.mean(self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1) + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") def _reward_dof_near_home(self): - return torch.mean( - self._sqrdexp( - (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos_obs"] - ), - dim=1, + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf + + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.cat( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) ) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..861f2826 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -3,14 +3,17 @@ LeggedRobotRunnerCfg, ) +BASE_HEIGHT_REF = 0.80 + class MITHumanoidCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 - num_observations = 49 + 3 * 18 # 121 num_actuators = 18 - episode_length_s = 100 # episode length in seconds - num_privileged_obs = num_observations + 
episode_length_s = 5 # episode length in seconds + + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] class terrain(LeggedRobotCfg.terrain): pass @@ -25,17 +28,17 @@ class init_state(LeggedRobotCfg.init_state): default_joint_angles = { "hip_yaw": 0.0, "hip_abad": 0.0, - "hip_pitch": -0.4, - "knee": 0.9, - "ankle": -0.45, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, "shoulder_pitch": 0.0, "shoulder_abad": 0.0, "shoulder_yaw": 0.0, - "elbow": 0.0, + "elbow": -1.25, } # * default COM for basic initialization - pos = [0.0, 0.0, 0.66] # x,y,z [m] + pos = [0.0, 0.0, 0.6] # x,y,z [m] rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] @@ -43,22 +46,22 @@ class init_state(LeggedRobotCfg.init_state): # * initialization for random range setup dof_pos_range = { - "hip_yaw": [0.0, 0.0], - "hip_abad": [0.0, 0.0], - "hip_pitch": [-0.29, -0.25], - "knee": [0.67, 0.71], - "ankle": [-0.43, -0.39], + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], + "elbow": [0, 0], } dof_vel_range = { - "hip_yaw": [-0.0, 0.1], - "hip_abad": [-0.0, 0.1], - "hip_pitch": [-0.1, -0.1], - "knee": [-0.05, 0.05], - "ankle": [-0.05, 0.05], + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], @@ -68,20 +71,20 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.7, 0.72], # z + [0.64, 0.7], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], ] # yaw root_vel_range = [ - [-0.1, 0.1], # x - [-0.1, 0.1], # y - [-0.1, 0.1], # z - [-0.1, 0.1], # roll - [-0.1, 0.1], # pitch - [-0.1, 0.1], - ] # yaw + [-0.75, 2.75], # x + [-0.55, 0.55], # y + [-0.35, 0.1], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] class control(LeggedRobotCfg.control): # * PD Drive parameters: @@ -94,39 +97,44 @@ class control(LeggedRobotCfg.control): "shoulder_pitch": 40.0, "shoulder_abad": 40.0, "shoulder_yaw": 40.0, - "elbow": 40.0, + "elbow": 50.0, } # [N*m/rad] damping = { - "hip_yaw": 5.0, - "hip_abad": 5.0, - "hip_pitch": 5.0, - "knee": 5.0, - "ankle": 5.0, - "shoulder_pitch": 5.0, - "shoulder_abad": 5.0, - "shoulder_yaw": 5.0, - "elbow": 5.0, + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 800 + desired_sim_frequency = 500 + + filter_gain = 0.1586 # 1: no filtering, 0: wall + + class oscillator: + base_frequency = 3.0 # [Hz] class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 1.0] # min max [m/s] - lin_vel_y = 1.0 # max [m/s] - yaw_vel = 1 # max [rad/s] + lin_vel_x = [-0.0, 4.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.0 # max [m/s] + yaw_vel = 0.0 # max [rad/s] class push_robots: - toggle = False - interval_s = 15 - max_push_vel_xy = 0.05 - push_box_dims = [0.1, 0.2, 0.3] # x,y,z [m] + toggle = True + interval_s = 1 + max_push_vel_xy = 0.5 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] class domain_rand: - randomize_friction = 
False + randomize_friction = True friction_range = [0.5, 1.25] randomize_base_mass = True added_mass_range = [-1.0, 1.0] @@ -134,61 +142,100 @@ class domain_rand: class asset(LeggedRobotCfg.asset): file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_R_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) + # foot_collisionbox_names = ["foot"] foot_name = "foot" - penalize_contacts_on = ["base", "arm"] + penalize_contacts_on = ["arm"] terminate_after_contacts_on = ["base"] - end_effector_names = ["hand", "foot"] + end_effector_names = ["hand", "foot"] # ?? flip_visual_attachments = False - self_collisions = 1 # 1 to disagble, 0 to enable...bitwise filter + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter collapse_fixed_joints = False # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) default_dof_drive_mode = 3 + fix_base_link = False disable_gravity = False disable_motors = False - apply_humanoid_jacobian = False + total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): - soft_dof_pos_limit = 0.9 - soft_dof_vel_limit = 0.9 - soft_torque_limit = 0.9 + soft_dof_pos_limit = 0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 max_contact_force = 1500.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 - base_height_target = 0.65 - tracking_sigma = 0.5 + # a smooth switch based on |cmd| (commanded velocity). + switch_scale = 0.5 + switch_threshold = 0.2 class scaling(LeggedRobotCfg.scaling): - # * dimensionless time: sqrt(L/g) or sqrt(I/[mgL]), with I=I0+mL^2 - virtual_leg_length = 0.65 - dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 - base_height = virtual_leg_length - base_lin_vel = virtual_leg_length / dimensionless_time - base_ang_vel = 3.14 / dimensionless_time - dof_vel = 20 # ought to be roughly max expected speed. 
- height_measurements = virtual_leg_length - - # todo check order of joints, create per-joint scaling - dof_pos = 3.14 - dof_pos_obs = dof_pos - # * Action scales + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + ] + # # * Action scales dof_pos_target = dof_pos - tau_ff = 0.1 + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + layer_norm = [True, True, False] + smooth_exploration = False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -196,59 +243,86 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", "dof_pos_obs", "dof_vel", - "dof_pos_history", + # "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - critic_obs = actor_obs + normalize_obs = False actions = ["dof_pos_target"] + disable_actions = False class noise: - base_height = 0.05 - dof_pos_obs = 0.0 - dof_vel = 0.0 - base_lin_vel = 0.1 - base_ang_vel = 0.2 - projected_gravity = 0.05 - height_measurements = 0.1 + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + layer_norm = [True, True, False] + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "dof_pos_obs", + "dof_vel", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", + ] + normalize_obs = False class reward: class weights: + tracking_lin_vel = 4.0 tracking_ang_vel = 0.5 - tracking_lin_vel = 0.5 - orientation = 1.5 - torques = 5.0e-6 + # orientation = 1.0 + torques = 5.0e-4 min_base_height = 1.5 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1e-3 + action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 - dof_vel = 0.0 - stand_still = 0.0 - dof_pos_limits = 0.0 - dof_near_home = 0.5 + # dof_vel = 0.25 + # stand_still = 0.25 + dof_pos_limits = 0.25 + dof_near_home = 0.25 + hips_forward = 0.0 + walk_freq = 0.0 # 2.5 class termination_weight: termination = 15 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 4096 + max_gradient_steps = 48 clip_param = 0.2 - entropy_coef = 0.001 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 5.0e-4 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 class 
runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 run_name = "Standing" diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py new file mode 100644 index 00000000..90bc60ae --- /dev/null +++ b/gym/envs/pendulum/pendulum.py @@ -0,0 +1,91 @@ +from math import sqrt +import torch +import numpy as np + +from gym.envs.base.fixed_robot import FixedRobot + + +class Pendulum(FixedRobot): + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) + + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, + ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[env_ids, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[env_ids, 1].unsqueeze(-1) + + def _reward_theta(self): + theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling + return self._sqrdexp(theta_rwd) + + def _reward_omega(self): + omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + return self._sqrdexp(omega_rwd) + + def _reward_equilibrium(self): + # theta_norm = self._normalize_theta() + # omega = self.dof_vel[:, 0] + # error = torch.stack( + # [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], + # dim=1) + # return self._sqrdexp(torch.mean(error, dim=1), 0.01) + + # todo compare alternatives + error = torch.abs(self.dof_state) + error[:, 0] /= self.scales["dof_pos"] + error[:, 1] /= self.scales["dof_vel"] + return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) + + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + + def _reward_energy(self): + kinetic_energy = ( + 0.5 + * self.cfg.asset.mass + * self.cfg.asset.length**2 + * torch.square(self.dof_vel[:, 0]) + ) + potential_energy = ( + self.cfg.asset.mass + * 9.81 + * self.cfg.asset.length + * torch.cos(self.dof_pos[:, 0]) + ) + desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length + energy_error = kinetic_energy + potential_energy - desired_energy + return self._sqrdexp(energy_error / desired_energy) + + def _normalize_theta(self): + # normalize to range [-pi, pi] + theta = self.dof_pos[:, 0] + return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_PSD_config.py b/gym/envs/pendulum/pendulum_PSD_config.py new file mode 100644 index 00000000..083588c8 --- /dev/null +++ b/gym/envs/pendulum/pendulum_PSD_config.py @@ -0,0 +1,62 @@ +from gym.envs.pendulum.pendulum_SAC_config import PendulumSACCfg, PendulumSACRunnerCfg + + +class PendulumPSDCfg(PendulumSACCfg): + class env(PendulumSACCfg.env): + pass + + class init_state(PendulumSACCfg.init_state): + pass + + class 
control(PendulumSACCfg.control): + pass + + class asset(PendulumSACCfg.asset): + pass + + class reward_settings(PendulumSACCfg.reward_settings): + pass + + class scaling(PendulumSACCfg.scaling): + pass + + +class PendulumPSDRunnerCfg(PendulumSACRunnerCfg): + seed = -1 + runner_class_name = "PSACRunner" + + class actor(PendulumSACRunnerCfg.actor): + pass + + class critic(PendulumSACRunnerCfg.critic): + obs = ["dof_pos_obs", "dof_vel"] + hidden_dims = [16, 16] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + dropouts = [0.1, 0.0] + layer_norm = [True, True] + activation = "elu" + # TODO[lm]: Current normalization uses torch.no_grad, this should be changed + normalize_obs = False + + # critic_class_name = "CholeskyLatent" + # critic_class_name = "Critic" + critic_class_name = "DenseSpectralLatent" + # * some class-specific params + minimize = False + relative_dim = 4 # 16 + latent_dim = 4 # 18, + latent_hidden_dims = [16] + latent_activation = "elu" + + class algorithm(PendulumSACRunnerCfg.algorithm): + alpha_lr = 1e-4 + actor_lr = 1e-4 + critic_lr = 1e-4 + + class runner(PendulumSACRunnerCfg.runner): + run_name = "" + experiment_name = "psd_pendulum" + algorithm_class_name = "SAC" + num_steps_per_env = 1 + save_interval = 500 + log_storage = True diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py new file mode 100644 index 00000000..42bf0265 --- /dev/null +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -0,0 +1,103 @@ +import torch +from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.pendulum.pendulum_config import PendulumCfg + + +class PendulumSACCfg(PendulumCfg): + class env(PendulumCfg.env): + num_envs = 1024 + episode_length_s = 10 + + class init_state(PendulumCfg.init_state): + reset_mode = "reset_to_range" + default_joint_angles = {"theta": 0.0} + dof_pos_range = { + "theta": [-torch.pi, torch.pi], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(PendulumCfg.control): + ctrl_frequency = 100 + desired_sim_frequency = 100 + + class asset(PendulumCfg.asset): + joint_damping = 0.1 + + class reward_settings(PendulumCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(PendulumCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + tau_ff = 1.0 + + +class PendulumSACRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor(FixedRobotCfgPPO.actor): + frequency = 10 + latent_nn = {"hidden_dims": [128, 64], "activation": "elu", "layer_norm": True} + mean_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + std_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + nn_params = {"latent": latent_nn, "mean": mean_nn, "std": std_nn} + + normalize_obs = False + obs = ["dof_pos_obs", "dof_vel"] + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic(FixedRobotCfgPPO.critic): + obs = [ + "dof_pos_obs", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + # TODO[lm]: Current normalization uses torch.no_grad, this should be changed + normalize_obs = False + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 2.0 + energy = 0.5 + dof_vel = 0.0 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + initial_fill = 0 + storage_size = 100 * 1024 # 10**6 # 17 + batch_size = 256 # 4096 + max_gradient_steps = 1 # 10 # 
SB3: 1 + action_max = 2.0 + action_min = -2.0 + actor_noise_std = 1.0 + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -1.0 + max_grad_norm = 1.0 + polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 1e-3 + critic_lr = 1e-3 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "sac_pendulum" + max_iterations = 30_000 # number of policy updates + algorithm_class_name = "SAC" + save_interval = 5000 + num_steps_per_env = 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py new file mode 100644 index 00000000..910fe341 --- /dev/null +++ b/gym/envs/pendulum/pendulum_config.py @@ -0,0 +1,133 @@ +import torch + +from gym.envs.base.fixed_robot_config import FixedRobotCfg, FixedRobotCfgPPO + + +class PendulumCfg(FixedRobotCfg): + class env(FixedRobotCfg.env): + num_envs = 4096 + num_actuators = 1 + episode_length_s = 10 + + class terrain(FixedRobotCfg.terrain): + pass + + class viewer: + ref_env = 0 + pos = [10.0, 5.0, 10.0] # [m] + lookat = [0.0, 0.0, 0.0] # [m] + + class init_state(FixedRobotCfg.init_state): + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} + + # * default setup chooses how the initial conditions are chosen. + # * "reset_to_basic" = a single position + # * "reset_to_range" = uniformly random from a range defined below + reset_mode = "reset_to_range" + + # * initial conditions for reset_to_range + dof_pos_range = { + "theta": [-torch.pi, torch.pi], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(FixedRobotCfg.control): + actuated_joints_mask = [1] # angle + ctrl_frequency = 25 + desired_sim_frequency = 200 + stiffness = {"theta": 0.0} # [N*m/rad] + damping = {"theta": 0.0} # [N*m*s/rad] + + class asset(FixedRobotCfg.asset): + # * Things that differ + file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" + disable_gravity = False + disable_motors = False # all torques set to 0 + joint_damping = 0.1 + mass = 1.0 + length = 1.0 + + class reward_settings(FixedRobotCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(FixedRobotCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + # * Action scales + tau_ff = 1.0 + + +class PendulumRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OnPolicyRunner" + + class actor(FixedRobotCfgPPO.actor): + frequency = 25 + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + # TODO[lm]: Handle normalization in SAC, then also use it here again + normalize_obs = False + obs = [ + "dof_pos_obs", + "dof_vel", + ] + + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + # TODO[lm]: Handle normalization in SAC, then also use it here again + normalize_obs = False + obs = [ + "dof_pos_obs", + "dof_vel", + ] + + class reward: + class weights: + theta = 0.1 + omega = 0.1 + equilibrium = 1.0 + energy = 0.5 + dof_vel = 0.1 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + # both + gamma = 0.95 + # discount_horizon = 2.0 + lam = 0.98 + # shared + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**16 # new + clip_param = 0.2 + learning_rate = 1.0e-4 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # 
Actor + entropy_coef = 0.01 + schedule = "fixed" # could be adaptive, fixed + desired_kl = 0.01 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "pendulum" + max_iterations = 200 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 32 diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 90af8bce..8e74d93c 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -191,7 +191,7 @@ def get_args(custom_parameters=None): { "name": "--task", "type": str, - "default": "cartpole", + "default": "lander", "help": "Resume training or start testing from a checkpoint. " "Overrides config file if provided.", }, diff --git a/gym/utils/logging_and_saving/wandb_singleton.py b/gym/utils/logging_and_saving/wandb_singleton.py index a1b45b8a..bba14fea 100644 --- a/gym/utils/logging_and_saving/wandb_singleton.py +++ b/gym/utils/logging_and_saving/wandb_singleton.py @@ -7,7 +7,7 @@ class WandbSingleton(object): def __new__(self): if not hasattr(self, "instance"): - self.instance = super(WandbSingleton, self).__new__(self) + self.instance = super().__new__(self) self.entity_name = None self.project_name = None self.experiment_name = "" diff --git a/gym/utils/task_registry.py b/gym/utils/task_registry.py index 6c2025ba..cc0787ca 100644 --- a/gym/utils/task_registry.py +++ b/gym/utils/task_registry.py @@ -178,6 +178,9 @@ def set_control_and_sim_dt(self, env_cfg, train_cfg): f" to {env_cfg.sim_dt}." ) + if not hasattr(train_cfg.actor, "frequency"): + train_cfg.actor.frequency = env_cfg.control.ctrl_frequency + def set_discount_rates(self, train_cfg, dt): if hasattr(train_cfg.algorithm, "discount_horizon"): hrzn = train_cfg.algorithm.discount_horizon diff --git a/learning/__init__.py b/learning/__init__.py index 34f15bd6..15c19314 100644 --- a/learning/__init__.py +++ b/learning/__init__.py @@ -29,3 +29,9 @@ # THE POSSIBILITY OF SUCH DAMAGE. # # Copyright (c) 2021 ETH Zurich, Nikita Rudin + +import os + +LEGGED_GYM_ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.realpath(__file__)) +) diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index 7827379b..7211a720 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -31,4 +31,6 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .ppo import PPO +from .ppo2 import PPO2 from .SE import StateEstimator +from .sac import SAC diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..92898ad5 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -69,7 +69,12 @@ def __init__( self.actor_critic = actor_critic self.actor_critic.to(self.device) self.storage = None # initialized later - self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate) + self.optimizer = optim.Adam( + self.actor_critic.actor.parameters(), lr=learning_rate + ) + self.crit_optimizer = optim.Adam( + self.actor_critic.critic.parameters(), lr=learning_rate + ) self.transition = RolloutStorage.Transition() # * PPO parameters @@ -121,6 +126,11 @@ def act(self, obs, critic_obs): return self.transition.actions def process_env_step(self, rewards, dones, timed_out=None): + if self.storage is None: + raise AttributeError( + "This version of PPO is deprecated and only works with OldPolicyRunner." + "Use PPO2 instead." 
+ ) self.transition.rewards = rewards.clone() self.transition.dones = dones # * Bootstrapping on time outs @@ -156,7 +166,6 @@ def update(self): actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) - value_batch = self.actor_critic.evaluate(critic_obs_batch) mu_batch = self.actor_critic.action_mean sigma_batch = self.actor_critic.action_std entropy_batch = self.actor_critic.entropy @@ -195,6 +204,7 @@ def update(self): surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() # * Value function loss + value_batch = self.actor_critic.evaluate(critic_obs_batch) if self.use_clipped_value_loss: value_clipped = target_values_batch + ( value_batch - target_values_batch @@ -207,16 +217,25 @@ def update(self): loss = ( surrogate_loss - + self.value_loss_coef * value_loss + # + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean() ) # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm) + nn.utils.clip_grad_norm_( + self.actor_critic.actor.parameters(), self.max_grad_norm + ) self.optimizer.step() + self.crit_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_( + self.actor_critic.critic.parameters(), self.max_grad_norm + ) + self.crit_optimizer.step() + self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py new file mode 100644 index 00000000..6c64e334 --- /dev/null +++ b/learning/algorithms/ppo2.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, + normalize, +) + + +class PPO2: + def __init__( + self, + actor, + critic, + batch_size=2**15, + max_gradient_steps=10, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) + self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + + # * PPO parameters + self.clip_param = clip_param + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def switch_to_train(self): + self.actor.train() + self.critic.train() + + def act(self, obs): + return self.actor.act(obs).detach() + + def update(self, data): + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic + ) + data["returns"] = data["advantages"] + data["values"] + self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) + self.update_actor(data) + + def update_critic(self, data): + self.mean_value_loss = 0 + counter = 0 + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch 
in generator: + value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data): + self.mean_surrogate_loss = 0 + counter = 0 + + self.actor.update_distribution(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + self.actor.update_distribution(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + + ( + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) + + for param_group in self.optimizer.param_groups: + # ! 
check this + param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) + ) + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += surrogate_loss.item() + counter += 1 + self.mean_surrogate_loss /= counter diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py new file mode 100644 index 00000000..7bda74cb --- /dev/null +++ b/learning/algorithms/sac.py @@ -0,0 +1,268 @@ +import torch + +# import torch.nn as nn +import torch.optim as optim + +from learning.utils import create_uniform_generator, polyak_update + + +class SAC: + def __init__( + self, + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + batch_size=2**15, + max_gradient_steps=10, + action_max=1.0, + action_min=-1.0, + actor_noise_std=1.0, + log_std_max=4.0, + log_std_min=-20.0, + alpha=0.2, + alpha_lr=1e-4, + target_entropy=None, + max_grad_norm=1.0, + polyak=0.995, + gamma=0.99, + actor_lr=1e-4, + critic_lr=1e-4, + device="cpu", + **kwargs, + ): + self.device = device + + # * SAC components + self.actor = actor.to(self.device) + self.critic_1 = critic_1.to(self.device) + self.critic_2 = critic_2.to(self.device) + self.target_critic_1 = target_critic_1.to(self.device) + self.target_critic_2 = target_critic_2.to(self.device) + self.target_critic_1.load_state_dict(self.critic_1.state_dict()) + self.target_critic_2.load_state_dict(self.critic_2.state_dict()) + + self.log_alpha = torch.log(torch.tensor(alpha)).requires_grad_() + + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr) + self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=critic_lr) + self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=critic_lr) + + # this is probably something to put into the neural network + self.action_max = action_max + self.action_min = action_min + + self.action_delta = (self.action_max - self.action_min) / 2.0 + self.action_offset = (self.action_max + self.action_min) / 2.0 + + self.max_grad_norm = max_grad_norm + self.target_entropy = ( + target_entropy if target_entropy else -self.actor.num_actions + ) + + # * SAC parameters + self.max_gradient_steps = max_gradient_steps + self.batch_size = batch_size + self.polyak = polyak + self.gamma = gamma + # self.ent_coef = "fixed" + # self.target_entropy = "fixed" + + @property + def alpha(self): + return self.log_alpha.exp() + + @property + def alpha_lr(self): + return self.log_alpha_optimizer.param_groups[0]["lr"] + + @property + def critic_1_lr(self): + return self.critic_1_optimizer.param_groups[0]["lr"] + + @property + def critic_2_lr(self): + return self.critic_2_optimizer.param_groups[0]["lr"] + + @property + def actor_lr(self): + return self.actor_optimizer.param_groups[0]["lr"] + + def switch_to_train(self): + self.actor.train() + self.critic_1.train() + self.critic_2.train() + self.target_critic_1.train() + self.target_critic_2.train() + + def switch_to_eval(self): + 
self.actor.eval() + self.critic_1.eval() + self.critic_2.eval() + self.target_critic_1.eval() + self.target_critic_2.eval() + + def act(self, obs): + mean, std = self.actor.forward(obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + + def act_inference(self, obs): + mean = self.actor.forward(obs, deterministic=True) + actions_normalized = torch.tanh(mean) + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + + def update(self, data): + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + + count = 0 + self.mean_actor_loss = 0 + self.mean_alpha_loss = 0 + self.mean_critic_1_loss = 0 + self.mean_critic_2_loss = 0 + + for batch in generator: + self.update_critic(batch) + self.update_actor_and_alpha(batch) + + count += 1 + # Update Target Networks + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak + ) + self.target_critic_2 = polyak_update( + self.critic_2, self.target_critic_2, self.polyak + ) + self.mean_actor_loss /= count + self.mean_alpha_loss /= count + self.mean_critic_1_loss /= count + self.mean_critic_2_loss /= count + + return None + + def update_critic(self, batch): + critic_obs = batch["critic_obs"] + actions = batch["actions"] + rewards = batch["rewards"] + next_actor_obs = batch["next_actor_obs"] + next_critic_obs = batch["next_critic_obs"] + dones = batch["dones"] + + with torch.no_grad(): + # * self._sample_action(actor_next_obs) + mean, std = self.actor.forward(next_actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + next_actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(next_actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = ( + distribution.log_prob(next_actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + target_action = actions_scaled + target_action_logp = action_logp + + # * self._critic_input + # ! 
def should put the action computation into the actor + target_critic_in = torch.cat((next_critic_obs, target_action), dim=-1) + target_critic_prediction_1 = self.target_critic_1.forward(target_critic_in) + target_critic_prediction_2 = self.target_critic_2.forward(target_critic_in) + + target_next = ( + torch.min(target_critic_prediction_1, target_critic_prediction_2) + - self.alpha.detach() * target_action_logp + ) + # the detach inside torch.no_grad() should be redundant + target = rewards + self.gamma * dones.logical_not() * target_next + + critic_in = torch.cat((critic_obs, actions), dim=-1) + + # critic_prediction_1 = self.critic_1.forward(critic_in) + critic_loss_1 = self.critic_1.loss_fn(critic_in, target) + self.critic_1_optimizer.zero_grad() + critic_loss_1.backward() + # nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) + self.critic_1_optimizer.step() + + # critic_prediction_2 = self.critic_2.forward(critic_in) + critic_loss_2 = self.critic_2.loss_fn(critic_in, target) + self.critic_2_optimizer.zero_grad() + critic_loss_2.backward() + # nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) + self.critic_2_optimizer.step() + + self.mean_critic_1_loss += critic_loss_1.item() + self.mean_critic_2_loss += critic_loss_2.item() + + return + + def update_actor_and_alpha(self, batch): + actor_obs = batch["actor_obs"] + critic_obs = batch["critic_obs"] + + mean, std = self.actor.forward(actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = ( + distribution.log_prob(actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + actor_prediction = actions_scaled + actor_prediction_logp = action_logp + + # entropy loss + alpha_loss = -( + self.log_alpha * (action_logp + self.target_entropy).detach() + ).mean() + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) + q_value_1 = self.critic_1.forward(critic_in) + q_value_2 = self.critic_2.forward(critic_in) + actor_loss = ( + self.alpha.detach() * actor_prediction_logp + - torch.min(q_value_1, q_value_2) + ).mean() + self.actor_optimizer.zero_grad() + actor_loss.backward() + # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.actor_optimizer.step() + + self.mean_alpha_loss += alpha_loss.item() + self.mean_actor_loss += actor_loss.item() diff --git a/learning/modules/QRCritics.py b/learning/modules/QRCritics.py new file mode 100644 index 00000000..9dd51489 --- /dev/null +++ b/learning/modules/QRCritics.py @@ -0,0 +1,805 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from learning.modules.utils import RunningMeanStd, create_MLP +from learning.modules.utils import ( + quadratify_xAx, + compose_cholesky, + create_lower_diagonal, + least_squares_fit, + forward_affine, + create_PD_lower_diagonal, +) + + +def init_weights(m): + if isinstance(m, nn.Linear): + torch.nn.init.xavier_uniform_(m.weight) + m.bias.data.fill_(0.01) + + +class Critic2(nn.Module): + def __init__( + self, 
+ num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + device="cuda", + **kwargs, + ): + super().__init__() + + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, **kwargs) + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + + def forward(self, x, return_all=False): + if self._normalize_obs: + with torch.no_grad(): + x = self.obs_rms(x) + value = self.NN(x).squeeze() + self.value_offset + if return_all: + return {"value": value} + else: + return value + + def evaluate(self, critic_observations, return_all=False): + return self.forward(critic_observations, return_all=return_all) + + def loss_fn(self, obs, target, **kwargs): + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") + + +class OuterProduct(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.NN = create_MLP(num_obs, latent_dim, hidden_dims, activation, dropouts) + # todo: have a linear NN to automatically coord-shift input + + def forward(self, x, return_all=False): + z = self.NN(x) + # outer product. Both of these are equivalent + A = z.unsqueeze(-1) @ z.unsqueeze(-2) + # A2 = torch.einsum("nmx,nmy->nmxy", z, z) + # value = self.sign * quadratify_xAx(x, A) + value = quadratify_xAx(x, A) + value *= 1.0 if self.minimize else -1.0 + value += self.value_offset + + if return_all: + # return value, A, L + return {"value": value, "A": A} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + return loss_NN + + +class OuterProductLatent(OuterProduct): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + latent_hidden_dims=[128, 128], + latent_activation="tanh", + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims, + latent_dim, + activation, + dropouts, + normalize_obs, + minimize, + device, + **kwargs, + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.NN(x) + # outer product. 
Both of these are equivalent + A = z.unsqueeze(-1) @ z.unsqueeze(-2) + # A2 = torch.einsum("nmx,nmy->nmxy", z, z) + value = self.sign * quadratify_xAx(self.latent_NN(x), A) + value += self.value_offset + + if return_all: + # return value, A, L + return {"value": value, "A": A, "z": z} + else: + return value + + +class CholeskyInput(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # V = x'Ax + b = a* x'Ax + b, and add a regularization for det(A) ~= 1 + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + num_lower_diag_elements = sum(range(latent_dim + 1)) + self.lower_diag_NN = create_MLP( + num_obs, num_lower_diag_elements, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN.apply(init_weights) + + def forward(self, x, return_all=False): + output = self.lower_diag_NN(x) + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + value = self.sign * quadratify_xAx(x, A) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + # return self.sign * quadratify_xAx(x, A) + self.value_offset + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + return F.mse_loss(self.forward(obs), target, reduction="mean") + + +class CholeskyLatent(CholeskyInput): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=0.0, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims, + latent_dim=latent_dim, + activation=activation, + dropouts=dropouts, + normalize_obs=normalize_obs, + minimize=minimize, + device="cuda", + **kwargs, + ) + + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + # self.latent_NN.apply(init_weights) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + output = self.lower_diag_NN(x) + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + + value = self.sign * quadratify_xAx(z, A) + # do not affect value offset in this part of the loss + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L, z + return {"value": value, "A": A, "L": L, "z": z} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + +class PDCholeskyInput(CholeskyInput): + def forward(self, x, return_all=False): + output = self.lower_diag_NN(x) + L = create_PD_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + # assert (torch.linalg.eigvals(A).real > 0).all() + # value = self.sign * quadratify_xAx(x, A) + value = quadratify_xAx(x, A) + value *= 1.0 if self.minimize else -1.0 + value += 
self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + # return self.sign * quadratify_xAx(x, A) + self.value_offset + + +class PDCholeskyLatent(CholeskyLatent): + def forward(self, x, return_all=False): + z = self.latent_NN(x) + output = self.lower_diag_NN(x) + L = create_PD_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + # assert (torch.linalg.eigvals(A).real > 0).all() + value = self.sign * quadratify_xAx(z, A) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + + +class SpectralLatent(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + activation="elu", + dropouts=0.0, + normalize_obs=False, + relative_dim=None, + latent_dim=None, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=0.0, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if relative_dim is None: + relative_dim = num_obs + if latent_dim is None: + latent_dim = num_obs + + self.num_obs = num_obs + self.relative_dim = relative_dim + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + # TODO up to here, same as Cholesky, can be refactored + n_outputs = ( + torch.tril_indices(self.relative_dim, self.latent_dim).shape[1] + + relative_dim + ) + self.spectral_NN = create_MLP( + num_obs, n_outputs, hidden_dims, activation, dropouts + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + y = self.spectral_NN(x) + A_diag = torch.diag_embed(F.softplus(y[..., : self.relative_dim])) + # tril_indices = torch.tril_indices(self.latent_dim, self.relative_dim) + tril_indices = torch.tril_indices(self.relative_dim, self.latent_dim) + L = torch.zeros( + (*x.shape[:-1], self.relative_dim, self.latent_dim), device=self.device + ) + L[..., tril_indices[0], tril_indices[1]] = y[..., self.relative_dim :] + # Compute (L^T A_diag) L + A = torch.einsum( + "...ij,...jk->...ik", + torch.einsum("...ik,...kj->...ij", L.transpose(-1, -2), A_diag), + L, + ) + # assert (torch.linalg.eigvals(A).real >= -1e-6).all() + # ! This fails, but so far with just -e-10, so really really small... + value = self.sign * quadratify_xAx(z, A) + # with torch.no_grad(): + value += self.value_offset + + if return_all: + # return value, z, torch.diag(A_diag), L, A + return { + "value": value, + "z": z, + # "A_diag": torch.diag(A_diag), #! 
throws shape error + "L": L, + "A": A, + } + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # if self.minimize: + # loss_offset = (self.value_offset / target.min() - 1.0).pow(2) + # else: + # loss_offset = (self.value_offset / target.max() - 1.0).pow(2) + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset # + loss_scaling + + +class DenseSpectralLatent(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + activation="elu", + dropouts=0.0, + normalize_obs=False, + relative_dim=None, + latent_dim=None, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=0.0, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if relative_dim is None: + relative_dim = num_obs + if latent_dim is None: + latent_dim = num_obs + + self.num_obs = num_obs + self.relative_dim = relative_dim + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + # TODO up to here, same as Cholesky, can be refactored + n_zeros = sum(range(relative_dim)) + n_outputs = relative_dim + relative_dim * latent_dim - n_zeros + self.rel_fill = n_zeros + # n_outputs = ( + # relative_dim * latent_dim + # - torch.tril_indices(self.relative_dim, self.latent_dim).shape[1] + # + relative_dim + # ) + self.spectral_NN = create_MLP( + num_obs, n_outputs, hidden_dims, activation, dropouts + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + y = self.spectral_NN(x) + A_diag = torch.diag_embed(F.softplus(y[..., : self.relative_dim])) + # tril_indices = torch.tril_indices(self.latent_dim, self.relative_dim) + tril_indices = torch.tril_indices(self.relative_dim, self.latent_dim) + L = torch.zeros( + (*x.shape[:-1], self.relative_dim, self.latent_dim), device=self.device + ) + L[..., tril_indices[0], tril_indices[1]] = y[ + ..., self.relative_dim : self.relative_dim + tril_indices.shape[1] + ] + L[..., :, self.relative_dim :] = y[ + ..., self.relative_dim + tril_indices.shape[1] : + ].view(L[..., :, self.relative_dim :].shape) + + # Compute (L^T A_diag) L + A = torch.einsum( + "...ij,...jk->...ik", + torch.einsum("...ik,...kj->...ij", L.transpose(-1, -2), A_diag), + L, + ) + # A = torch.einsum( + # "...ij,...jk->...ik", + # torch.einsum("...ik,...kj->...ij", L, A_diag), + # L.transpose(-1, -2), + # ) + # assert (torch.linalg.eigvals(A).real >= -1e-6).all() + # ! This fails, but so far with just -e-10, so really really small... + value = self.sign * quadratify_xAx(z, A) + # with torch.no_grad(): + value += self.value_offset + + if return_all: + # return value, z, torch.diag(A_diag), L, A + return { + "value": value, + "z": z, + # "A_diag": torch.diag(A_diag), #! 
throws shape error + "L": L, + "A": A, + } + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # if self.minimize: + # loss_offset = (self.value_offset / target.min() - 1.0).pow(2) + # else: + # loss_offset = (self.value_offset / target.max() - 1.0).pow(2) + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset # + loss_scaling + + +class QR(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + action_dim, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.action_dim = action_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + # Make Q estimator + num_lower_diag_elements_Q = sum(range(self.latent_dim + 1)) + self.lower_diag_NN_Q = create_MLP( + latent_dim, num_lower_diag_elements_Q, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN_Q.apply(init_weights) + + # Make R estimator + num_lower_diag_elements_R = sum(range(action_dim + 1)) + self.lower_diag_NN_R = create_MLP( + latent_dim, num_lower_diag_elements_R, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN_R.apply(init_weights) + + def forward(self, z, u, return_all=False): + Q_lower_diag = self.lower_diag_NN_Q(z) + L_Q = create_lower_diagonal(Q_lower_diag, self.latent_dim, self.device) + Q = compose_cholesky(L_Q) + + R_lower_diag = self.lower_diag_NN_R(z) + L_R = create_lower_diagonal(R_lower_diag, self.action_dim, self.device) + R = compose_cholesky(L_R) + + if return_all: + # return Q, L_Q, R, L_R + return {"Q": Q, "L_Q": L_Q, "R": R, "L_R": L_R} + else: + return Q, R + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, z, target, actions, **kwargs): + Q, R = self.forward(z, target) + Q_value = self.sign * quadratify_xAx(z, Q) + R_value = self.sign * quadratify_xAx(actions, R) + gae_loss = F.mse_loss(Q_value + R_value, target, reduction="mean") + return gae_loss + + def riccati_loss_fn(self, z, target, P, **kwargs): + Q, R = self.forward(z, target) + A, B = self.linearize_pendulum_dynamics() + A = A.to(self.device) + B = B.to(self.device) + AT_P_A = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", A.transpose(-1, -2), P), + A, + ) + AT_P_B = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", A.transpose(-1, -2), P), + B, + ) + R_BT_B = R + torch.einsum("...ij, ...jk -> ...ik", B.transpose(-1, -2), B) + BT_P_A = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", B.transpose(-1, -2), P), + A, + ) + final_term = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", AT_P_B, torch.linalg.inv(R_BT_B)), + BT_P_A, + ) + riccati_loss = Q + AT_P_A - P - final_term + return F.mse_loss(riccati_loss, target=0.0, reduction="mean") + + def linearize_pendulum_dynamics(self, x_desired=torch.tensor([0.0, 0.0])): + m = 1.0 + b = 0.1 + length = 1.0 + g = 9.81 + ml2 = m * length**2 + + A = torch.tensor( + [[0.0, 1.0], [g / length * 
torch.cos(x_desired[0]), -b / ml2]], + device=self.device, + ) + B = torch.tensor([[0.0], [(1.0 / ml2)]], device=self.device) + return A, B + + +class QPNet(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=0.0, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # V = x'Ax + b = a* x'Ax + b, and add a regularization for det(A) ~= 1 + self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.num_lower_diag_elements = sum(range(latent_dim + 1)) + self.lower_diag_NN = create_MLP( + num_obs, + self.num_lower_diag_elements + num_obs, + hidden_dims, + activation, + dropouts, + ) # ! num_obs added to output to create c for c.T@x + # self.lower_diag_NN.apply(init_weights) + + def forward(self, x, return_all=False): + res = self.lower_diag_NN(x) + output = res[..., : self.num_lower_diag_elements] + c = res[..., self.num_lower_diag_elements :] + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(nn.ReLU()(L)) + value = self.sign * quadratify_xAx(x, A) + torch.einsum( + "...ij, ...ij -> ...i", c, x + ) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L, "c": c} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset + loss_scaling + + +class NN_wRiccati(nn.Module): + def __init__(self, **kwargs): + super().__init__() + # assert ( + # "critic_name" in kwargs.keys() + # ), "Name of custom critic not specified in NN with Riccati init" + # assert "action_dim" in kwargs.keys(), "Dimension of NN actions not specified." 
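For context, riccati_loss_fn above is meant to pull the learned quadratic cost toward a fixed point of the discrete-time algebraic Riccati equation for the linearized pendulum. A standalone sketch of the standard residual follows; it is not part of the patch, and it assumes square matrices with R + B'PB invertible. Two points worth double-checking against the code above: the textbook form uses R + B'PB where the loss builds R + B'B, and F.mse_loss expects a tensor target, so residual.pow(2).mean() or a torch.zeros_like target is probably needed.

import torch


def dare_residual(P, A, B, Q, R):
    # Residual of P = Q + A'PA - A'PB (R + B'PB)^(-1) B'PA; zero at the DARE fixed point.
    AtPA = A.T @ P @ A
    AtPB = A.T @ P @ B
    BtPA = B.T @ P @ A
    BtPB = B.T @ P @ B
    return Q + AtPA - AtPB @ torch.linalg.inv(R + BtPB) @ BtPA - P


# Usage with the linearized pendulum, e.g.:
#   A, B = qr_network.linearize_pendulum_dynamics()
#   loss = dare_residual(P, A, B, Q, R).pow(2).mean()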
+ critic_name = kwargs["critic_name"] + device = kwargs["device"] + self.has_latent = False if kwargs["latent_dim"] is None else True + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + self.QR_network = QR(**kwargs).to(device) + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + with torch.no_grad(): + P = self.critic(obs, return_all=True)["A"] + z = obs if self.has_latent is False else self.critic.latent_NN(obs) + return self.QR_network.riccati_loss_fn(z, target, P=P) + + +class NN_wQR(nn.Module): + def __init__(self, **kwargs): + super().__init__() + # assert ( + # "critic_name" in kwargs.keys() + # ), "Name of custom critic not specified in NN with Riccati init" + # assert "action_dim" in kwargs.keys(), "Dimension of NN actions not specified." + critic_name = kwargs["critic_name"] + device = kwargs["device"] + self.has_latent = False if kwargs["latent_dim"] is None else True + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + self.QR_network = QR(**kwargs).to(device) + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + z = obs if self.has_latent is False else self.critic.latent_NN(obs) + return self.QR_network.loss_fn(z, target, actions=actions) + + +class NN_wLinearLatent(nn.Module): + def __init__(self, **kwargs): + super().__init__() + critic_name = kwargs["critic_name"] + device = kwargs["device"] # noqa + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + + @property + def value_offset(self): + return self.critic.value_offset + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + z = self.critic.latent_NN(obs) + u = actions.detach().clone() + u = u.view(-1, u.shape[-1]) + K, z_offset = least_squares_fit(z, u) + return F.mse_loss(forward_affine(z, K, z_offset), target=u, reduction="mean") diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..9505377d 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,7 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .critic import Critic +from .chimera_actor import ChimeraActor +from .smooth_actor import SmoothActor +from .QRCritics import * diff --git a/learning/modules/actor.py 
b/learning/modules/actor.py index f9c82396..0f112dba 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -17,11 +17,6 @@ def __init__( normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Actor.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self._normalize_obs = normalize_obs @@ -30,7 +25,9 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation, **kwargs) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -46,14 +43,16 @@ def action_mean(self): def action_std(self): return self.distribution.stddev + @property + def get_std(self): + return self.distribution.stddev + @property def entropy(self): return self.distribution.entropy().sum(dim=-1) def update_distribution(self, observations): - if self._normalize_obs: - observations = self.normalize(observations) - mean = self.NN(observations) + mean = self.act_inference(observations) self.distribution = Normal(mean, mean * 0.0 + self.std) def act(self, observations): @@ -65,16 +64,16 @@ def get_actions_log_prob(self, actions): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) - actions_mean = self.NN(observations) - return actions_mean + with torch.no_grad(): + observations = self.obs_rms(observations) + return self.NN(observations) def forward(self, observations): return self.act_inference(observations) - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) - def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..cec8cbb5 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,44 +1,17 @@ import torch.nn as nn -from .actor import Actor -from .critic import Critic - class ActorCritic(nn.Module): - def __init__( - self, - num_actor_obs, - num_critic_obs, - num_actions, - actor_hidden_dims=[256, 256, 256], - critic_hidden_dims=[256, 256, 256], - activation="elu", - init_noise_std=1.0, - normalize_obs=True, - **kwargs, - ): + def __init__(self, actor, critic, **kwargs): if kwargs: print( "ActorCritic.__init__ got unexpected arguments, " "which will be ignored: " + str([key for key in kwargs.keys()]) ) - super(ActorCritic, self).__init__() - - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) - - self.critic = Critic( - num_critic_obs, critic_hidden_dims, activation, normalize_obs - ) + super().__init__() - print(f"Actor MLP: {self.actor.NN}") - print(f"Critic MLP: {self.critic.NN}") + self.actor = actor + self.critic = critic @property def action_mean(self): diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py new file mode 100644 index 00000000..1d5ead41 --- /dev/null +++ b/learning/modules/chimera_actor.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal +from .utils import create_MLP +from .utils import export_network +from .utils import RunningMeanStd + + +class 
ChimeraActor(nn.Module): + def __init__( + self, + num_obs, + num_actions, + # hidden_dims, + # activation, + nn_params, + std_init=1.0, + log_std_max=4.0, + log_std_min=-20.0, + normalize_obs=True, + **kwargs, + ): + super().__init__() + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.log_std_max = log_std_max + self.log_std_min = log_std_min + self.log_std_init = torch.tensor([std_init]).log() # refactor + + self.num_obs = num_obs + self.num_actions = num_actions + + self.latent_NN = create_MLP( + num_inputs=num_obs, + num_outputs=nn_params["latent"]["hidden_dims"][-1], + **nn_params["latent"], + ) + self.mean_NN = create_MLP( + num_inputs=nn_params["latent"]["hidden_dims"][-1], + num_outputs=num_actions, + **nn_params["mean"], + ) + self.std_NN = create_MLP( + num_inputs=nn_params["latent"]["hidden_dims"][-1], + num_outputs=num_actions, + **nn_params["std"], + ) + + # maybe zap + self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) + Normal.set_default_validate_args = False + + def forward(self, x, deterministic=True): + if self._normalize_obs: + with torch.no_grad(): + x = self.obs_rms(x) + latent = self.latent_NN(x) + mean = self.mean_NN(latent) + if deterministic: + return mean + log_std = self.log_std_init + self.std_NN(latent) + return mean, log_std.clamp(self.log_std_min, self.log_std_max).exp() + + def act(self, x): + mean, std = self.forward(x, deterministic=False) + self.distribution = Normal(mean, std) + return self.distribution.sample() + + def inference_policy(self, x): + return self.forward(x, deterministic=True) + + def export(self, path): + export_network(self.inference_policy, "policy", path, self.num_obs) + + def to(self, device): + super().to(device) + self.log_std_init = self.log_std_init.to(device) + return self diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..5dfad32d 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn + from .utils import create_MLP from .utils import RunningMeanStd @@ -8,28 +9,26 @@ class Critic(nn.Module): def __init__( self, num_obs, - hidden_dims, + hidden_dims=None, activation="elu", normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Critic.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, **kwargs) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) - def evaluate(self, critic_observations): + def forward(self, x): if self._normalize_obs: - critic_observations = self.normalize(critic_observations) - return self.NN(critic_observations).squeeze() + with torch.no_grad(): + x = self.obs_rms(x) + return self.NN(x).squeeze() + + def evaluate(self, critic_observations): + return self.forward(critic_observations) - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) + def loss_fn(self, obs, target, **kwargs): + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..c02e9203 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor 
import Actor +from .utils import create_MLP + + +# The following implementation is based on the gSDE paper. See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = True, + learn_features: bool = True, + epsilon: float = 1e-6, + log_std_init: float = 0.0, + exploration_sample_freq: int = 16, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_exp_ln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, self.hidden_dims[-1], self.hidden_dims[:-1], self.activation + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix + self.update_exploration_matrices(batch_size=1) + self.distribution = None + + self.call_counter = 0 + self.resampling_frequenchy = exploration_sample_freq + + def update_exploration_matrices(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std + self.weights_dist = Normal(torch.zeros_like(std), std) + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + @property + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_exp_ln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std + + def update_distribution(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std**2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) + else: + raise ValueError("Invalid latent_sde dimension") + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + if self.call_counter % self.resampling_frequenchy == 0: + self.update_exploration_matrices(observations.shape[0]) + self.call_counter += 1 + + self.update_distribution(observations) + mean 
= self.distribution.mean + sample = mean + self.get_noise() + return sample + + def act_inference(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) + return noise.squeeze(dim=1) diff --git a/learning/modules/state_estimator.py b/learning/modules/state_estimator.py index f8cfe221..dfba137d 100644 --- a/learning/modules/state_estimator.py +++ b/learning/modules/state_estimator.py @@ -20,7 +20,7 @@ def __init__( num_outputs, hidden_dims=[256, 128], activation="elu", - dropouts=None, + dropouts=0.0, **kwargs, ): if kwargs: diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 1422b82d..35868ca0 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,2 +1,3 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd +from .matfncs import create_lower_diagonal, create_PD_lower_diagonal, compose_cholesky, quadratify_xAx, least_squares_fit, forward_affine \ No newline at end of file diff --git a/learning/modules/utils/matfncs.py b/learning/modules/utils/matfncs.py new file mode 100644 index 00000000..92ebd0e7 --- /dev/null +++ b/learning/modules/utils/matfncs.py @@ -0,0 +1,53 @@ +import torch + + +def create_lower_diagonal(x, n, device): + L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + rows, cols = tril_indices + L[..., rows, cols] = x + return L + + +def create_PD_lower_diagonal(x, n, device): + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + diag_indices = (tril_indices[0] == tril_indices[1]).nonzero(as_tuple=True)[0] + x[..., diag_indices] = F.softplus(x[..., diag_indices]) + L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) + rows, cols = tril_indices + L[..., rows, cols] = x + return L + + +def compose_cholesky(L): + return torch.einsum("...ij, ...jk -> ...ik", L, L.transpose(-2, -1)) + + +def quadratify_xAx(x, A): + return torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), + x.unsqueeze(-1), + ).squeeze() + + +def least_squares_fit(x, y): + x_flat = x.view(-1, x.shape[-1]) + y_flat = y.view(-1, y.shape[-1]) + ones = torch.ones(x_flat.shape[0], 1, device=x.device) + x_aug = torch.cat([ones, x_flat], dim=1) + # Use torch.linalg.lstsq to find the least-squares solution + result = torch.linalg.lstsq(x_aug, y_flat) + coefficients = result.solution + bias = coefficients[0] + weights = coefficients[1:] + + return weights, bias + + +def forward_affine(x, weights, bias): + x_flat = x.view(-1, x.shape[-1]) + x_aug = torch.cat( + [torch.ones(x_flat.shape[0], 1, device=x_flat.device), x_flat], dim=1 + ) + return x_aug.matmul(torch.cat([bias.unsqueeze(0), weights], dim=0)) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..27b47a54 100644 --- a/learning/modules/utils/neural_net.py 
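The matfncs helpers above carry the core math of the quadratic critics: compose_cholesky builds A = L L' (positive semi-definite by construction) and quadratify_xAx evaluates x'Ax batch-wise. One likely fix: create_PD_lower_diagonal calls F.softplus, but matfncs.py only imports torch, so it presumably also needs import torch.nn.functional as F. A small self-contained sanity check (batch size and dimensions are local to this sketch):

import torch

from learning.modules.utils import (
    compose_cholesky,
    create_lower_diagonal,
    quadratify_xAx,
)

batch, n = 8, 3
raw = torch.randn(batch, sum(range(n + 1)))       # n(n+1)/2 lower-triangular entries
L = create_lower_diagonal(raw, n, device="cpu")
A = compose_cholesky(L)                           # A = L L^T, PSD by construction
x = torch.randn(batch, n)

quad = quadratify_xAx(x, A)                       # batched x^T A x
ref = torch.einsum("bi,bij,bj->b", x, A, x)       # reference computation
assert torch.allclose(quad, ref, atol=1e-5)
assert (torch.linalg.eigvalsh(A) >= -1e-5).all()  # eigenvalues non-negative up to tolerance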
+++ b/learning/modules/utils/neural_net.py @@ -3,29 +3,64 @@ import copy -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): - activation = get_activation(activation) - - if dropouts is None: - dropouts = [0] * len(hidden_dims) +def create_MLP( + num_inputs, + num_outputs, + hidden_dims, + activations=None, + dropouts=0.0, + layer_norm=False, + just_list=False, + **kwargs, +): + if not isinstance(activations, list): + activations = [activations] * len(hidden_dims) + if not isinstance(dropouts, list): + dropouts = [dropouts] * len(hidden_dims) + if not isinstance(layer_norm, list): + layer_norm = [layer_norm] * len(hidden_dims) layers = [] - if not hidden_dims: # handle no hidden layers + # first layer + if len(hidden_dims) > 0: + add_layer( + layers, + num_inputs, + hidden_dims[0], + activations[0], + dropouts[0], + layer_norm[0], + ) + for i in range(len(hidden_dims) - 1): + add_layer( + layers, + hidden_dims[i], + hidden_dims[i + 1], + activations[i + 1], + dropouts[i + 1], + layer_norm=layer_norm[i + 1], + ) + else: + add_layer(layers, hidden_dims[-1], num_outputs) + else: # handle no hidden dims, just linear layer add_layer(layers, num_inputs, num_outputs) + + if just_list: + return layers else: - add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) - for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: - add_layer(layers, hidden_dims[i], num_outputs) - else: - add_layer( - layers, - hidden_dims[i], - hidden_dims[i + 1], - activation, - dropouts[i + 1], - ) - return torch.nn.Sequential(*layers) + return torch.nn.Sequential(*layers) + + +def add_layer( + layer_list, num_inputs, num_outputs, activation=None, dropout=0, layer_norm=False +): + layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) + if layer_norm: + layer_list.append(torch.nn.LayerNorm(num_outputs)) + if dropout > 0: + layer_list.append(torch.nn.Dropout(p=dropout)) + if activation is not None: + layer_list.append(get_activation(activation)) def get_activation(act_name): @@ -36,26 +71,28 @@ def get_activation(act_name): elif act_name == "relu": return torch.nn.ReLU() elif act_name == "crelu": - return torch.nn.ReLU() + return torch.nn.CELU() elif act_name == "lrelu": return torch.nn.LeakyReLU() elif act_name == "tanh": return torch.nn.Tanh() elif act_name == "sigmoid": return torch.nn.Sigmoid() + elif act_name == "softplus": + return torch.nn.Softplus() + elif act_name == "softmax": + return torch.nn.Softmax(dim=-1) + elif act_name == "randrelu": + return torch.nn.RReLU() + elif act_name == "softsign": + return torch.nn.Softsign() + elif act_name == "mish": + return torch.nn.Mish() else: print("invalid activation function!") return None -def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): - layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) - if dropout > 0: - layer_list.append(torch.nn.Dropout(p=dropout)) - if activation is not None: - layer_list.append(activation) - - def export_network(network, network_name, path, num_inputs): """ Thsi function traces and exports the given network module in .pt and @@ -72,9 +109,11 @@ def export_network(network, network_name, path, num_inputs): path_TS = os.path.join(path, network_name + ".pt") # TorchScript path path_onnx = os.path.join(path, network_name + ".onnx") # ONNX path model = copy.deepcopy(network).to("cpu") + model.device = "cpu" # force all tensors created as intermediate steps to CPU # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = 
torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) + # dummy_input = torch.rand((2, num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 5623b80c..246bafa4 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -2,31 +2,10 @@ import torch.nn as nn -def get_mean_var_with_masks(values, masks): - """ - Applies a mask to the input values and calculates - the mean and variance over the valid entries, as specified - by the mask. - """ - sum_mask = masks.sum() - masked_vals = values * masks - values_mean = masked_vals.sum() / sum_mask - min_sqr = (((masked_vals) ** 2) / sum_mask).sum() - ( - (masked_vals / sum_mask).sum() - ) ** 2 - values_var = min_sqr * sum_mask / (sum_mask - 1) - return values_mean, values_var - - class RunningMeanStd(nn.Module): - """ - Keeps a running mean to normalize tensor of choice. - """ - - def __init__(self, num_items, axis=0, epsilon=1e-05): - super(RunningMeanStd, self).__init__() + def __init__(self, num_items, epsilon=1e-05): + super().__init__() self.num_items = num_items - self.axis = axis self.epsilon = epsilon self.register_buffer( @@ -44,11 +23,6 @@ def _update_mean_var_from_moments( batch_var, batch_count, ): - """ - Implements parallel algorithm for combining arbitrary sets A and B - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - #Parallel_algorithm - """ tot_count = running_count + batch_count delta = batch_mean - running_mean @@ -64,16 +38,10 @@ def _update_mean_var_from_moments( new_var = M2 / (tot_count - 1) return new_mean, new_var, tot_count - def forward(self, input, mask=None): - """ - Returns the normalized version of the input. 
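Since this hunk drops the docstrings (including the pointer to the parallel-variance algorithm), a reference sketch of the combination rule that _update_mean_var_from_moments keeps may be worth retaining nearby. The sketch assumes Bessel-corrected (sample) variances, consistent with new_var = M2 / (tot_count - 1) above; the exact weighting of the two second moments sits in unchanged context not shown here.

def combine_mean_var(mean_a, var_a, n_a, mean_b, var_b, n_b):
    # Chan et al. parallel update: merge running stats of set A with batch stats of B.
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    m2 = var_a * (n_a - 1) + var_b * (n_b - 1) + delta**2 * n_a * n_b / n
    return mean, m2 / (n - 1), n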
- """ + def forward(self, input): if self.training: - if mask is not None: - mean, var = get_mean_var_with_masks(input, mask) - else: - mean = input.mean(self.axis) - var = input.var(self.axis) + mean = input.mean(tuple(range(input.dim() - 1))) + var = input.var(tuple(range(input.dim() - 1))) ( self.running_mean, self.running_var, diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index c0462853..1545293b 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch -from learning.algorithms import PPO -from learning.modules import ActorCritic +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -8,33 +8,46 @@ class BaseRunner: def __init__(self, env, train_cfg, device="cpu"): self.device = device self.env = env + self.setup_reward_functions() self.parse_train_cfg(train_cfg) - - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **self.policy_cfg - ).to(self.device) - - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg: PPO = alg_class(actor_critic, device=self.device, **self.alg_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 - - # * init storage and model - self.init_storage() self.log_dir = train_cfg["log_dir"] + self._set_up_alg() + + def setup_reward_functions(self): + self.reward_functions = { + method.replace("_reward_", ""): getattr(self.env, method) + for method in dir(self.env) + if callable(getattr(self.env, method)) and method.startswith("_reward_") + } + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + + critic = Critic(num_critic_obs, **self.critic_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["policy"]["reward"]["weights"]) - self.policy_cfg = train_cfg["policy"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + remove_zero_weighted_rewards( + train_cfg["critic"]["reward"]["termination_weight"] + ) + self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] def init_storage(self): raise NotImplementedError @@ -89,8 +102,8 @@ def get_rewards(self, reward_weights, modifier=1, mask=None): if mask is None: mask = 1.0 for name, weight in reward_weights.items(): - rewards_dict[name] = mask * self._get_reward({name: weight}, modifier) + rewards_dict[name] = mask * self._get_reward(name, weight * modifier) return rewards_dict - def _get_reward(self, name_weight, modifier=1): - return modifier * self.env.compute_reward(name_weight).to(self.device) + def _get_reward(self, name, weight): + return weight * 
self.reward_functions[name]().to(self.device) diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index a24a0c9c..657b2f25 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -31,4 +31,9 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .on_policy_runner import OnPolicyRunner -from .my_runner import MyRunner \ No newline at end of file +from .my_runner import MyRunner +from .old_policy_runner import OldPolicyRunner +from .off_policy_runner import OffPolicyRunner +from .custom_critic_runner import CustomCriticRunner +from .datalogging_runner import DataLoggingRunner +from .psd_sac_runner import PSACRunner \ No newline at end of file diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py new file mode 100644 index 00000000..fbfb8c05 --- /dev/null +++ b/learning/runners/custom_critic_runner.py @@ -0,0 +1,178 @@ +import os +import numpy as np +import torch +from tensordict import TensorDict + +from gym import LEGGED_GYM_ROOT_DIR +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor +from learning.utils import Logger + +from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class CustomCriticRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + critic_class_name = self.critic_cfg["critic_class_name"] + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) + self.all_obs[: self.env.num_envs, :] = actor_obs + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = 
self.get_obs(self.critic_cfg["obs"]) + start = self.env.num_envs * self.it + end = self.env.num_envs * (self.it + 1) + self.all_obs[start:end, :] = actor_obs + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "standard_training_data.npy" + if "Cholesky" not in self.critic_cfg["critic_class_name"] + else "custom_training_data.npy", + ) + + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py new file mode 100644 index 00000000..2f853fae --- /dev/null +++ b/learning/runners/datalogging_runner.py @@ -0,0 +1,213 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.storage import DictStorage +from learning.utils import export_to_numpy + +logger = Logger() +storage = DictStorage() + + +class DataLoggingRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": 
self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + self.save() + # burn in observation normalization. + if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + self.env.reset() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "terminated": terminated, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.save() + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + # if self.it % self.save_interval == 0: + # self.save() + # self.save() + # print(f"Saved data from LQR Pendulum run to {dict_save_path}") + + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = 
os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index c3b0b477..e0da1a1b 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -1,81 +1,135 @@ import torch +from tensordict import TensorDict from learning.utils import Logger -from learning.utils import PotentialBasedRewardShaping -from learning.utils import remove_zero_weighted_rewards - from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage +from learning.algorithms import PPO2 # noqa F401 +from learning.modules.actor import Actor +from learning.modules.critic import Critic # noqa F401 +from learning.modules.QRCritics import * # noqa F401 logger = Logger() +storage = DictStorage() class MyRunner(OnPolicyRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) - def learn(self): - self.set_up_logger() - remove_zero_weighted_rewards(self.policy_cfg["reward"]["pbrs_weights"]) - PBRS = PotentialBasedRewardShaping( - self.policy_cfg["reward"]["pbrs_weights"], self.device - ) - logger.register_rewards(PBRS.get_reward_keys()) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic_class_name = self.critic_cfg["critic_class_name"] + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) - rewards_dict = {} + def learn(self, states_to_log_dict=None): + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - tot_iter = self.it + self.num_learning_iterations + rewards_dict = 
self.initialize_rewards_dict(n_policy_steps) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + # burn in observation normalization. + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) - PBRS.pre_step(self.env) - self.env.step() + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated + critic_obs = self.get_obs(self.critic_cfg["obs"]) - self.update_rewards(rewards_dict, terminated) - rewards_dict.update(PBRS.post_step(self.env, dones)) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_all_categories() @@ -88,33 +142,118 @@ def learn(self): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, 
n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + def update_rewards(self, rewards_dict, terminated): + # sum existing rewards with new rewards + rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) - def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) - logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device ) - logger.register_rewards(["total_rewards"]) - logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~self.env.terminated, + ), + inplace=True, ) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
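For reference, a sketch of the states_to_log_dict layout that sim_and_log_states expects; the state names, iteration count, and sizes below are illustrative assumptions, not values from this patch.

import torch

# One tensor per logged state, indexed [env_slot, iteration, step, state_dim].
# The runner fills only the first environment, and only every 10th iteration.
max_iterations, steps = 30, 100
states_to_log_dict = {
    "terminated": torch.zeros(1, max_iterations, steps, 1, dtype=torch.bool),
    "dof_pos": torch.zeros(1, max_iterations, steps, 12),  # hypothetical 12-DoF state
}
# runner.learn(states_to_log_dict=states_to_log_dict) would then populate it during training.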
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + + with torch.inference_mode(): + for i in range(steps): + actions = self.alg.act(actor_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py new file mode 100644 index 00000000..83249aef --- /dev/null +++ b/learning/runners/off_policy_runner.py @@ -0,0 +1,318 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.modules import Critic, ChimeraActor +from learning.storage import ReplayBuffer +from learning.algorithms import SAC + +logger = Logger() +storage = ReplayBuffer() + + +class OffPolicyRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = ChimeraActor(num_actor_obs, num_actions, **self.actor_cfg) + critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + print(actor) + + self.alg = SAC( + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + device=self.device, + **self.alg_cfg, + ) + + def learn(self): + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) + + rewards_dict = self.initialize_rewards_dict(n_policy_steps) + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + actions = self.alg.act(actor_obs) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.alg_cfg["storage_size"], + device=self.device, + ) + + # fill buffer + for _ in range(self.alg_cfg["initial_fill"]): + with torch.inference_mode(): + actions = torch.rand_like(actions) * 2 - 1 + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + for step in range(n_policy_steps): + self.env.step() + # put 
reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) + # print every 10% of initial fill + if (self.alg_cfg["initial_fill"] > 10) and ( + _ % (self.alg_cfg["initial_fill"] // 10) == 0 + ): + print(f"Filled {100 * _ / self.alg_cfg['initial_fill']}%") + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict.sum(dim=0)) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(self.env.timed_out | self.env.terminated) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.get_data()) + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, + ) + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~self.env.terminated, + ), + inplace=True, + ) + + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + 
rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", + self.alg, + [ + "mean_critic_1_loss", + "mean_critic_2_loss", + "mean_actor_loss", + "mean_alpha_loss", + "alpha", + "alpha_lr", + "critic_1_lr", + "critic_2_lr", + "actor_lr", + ], + ) + # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb( + (self.alg.actor, self.alg.critic_1, self.alg.critic_2) + ) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + save_dict = { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_1_state_dict": self.alg.critic_1.state_dict(), + "critic_2_state_dict": self.alg.critic_2.state_dict(), + "log_alpha": self.alg.log_alpha, + "actor_optimizer_state_dict": self.alg.actor_optimizer.state_dict(), + "critic_1_optimizer_state_dict": self.alg.critic_1_optimizer.state_dict(), + "critic_2_optimizer_state_dict": self.alg.critic_2_optimizer.state_dict(), + "log_alpha_optimizer_state_dict": self.alg.log_alpha_optimizer.state_dict(), + "iter": self.it, + } + torch.save(save_dict, path) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic_1.load_state_dict(loaded_dict["critic_1_state_dict"]) + self.alg.critic_2.load_state_dict(loaded_dict["critic_2_state_dict"]) + self.log_alpha = loaded_dict["log_alpha"] + if load_optimizer: + self.alg.actor_optimizer.load_state_dict( + loaded_dict["actor_optimizer_state_dict"] + ) + self.alg.critic_1_optimizer.load_state_dict( + loaded_dict["critic_1_optimizer_state_dict"] + ) + self.alg.critic_2_optimizer.load_state_dict( + loaded_dict["critic_2_optimizer_state_dict"] + ) + self.alg.log_alpha_optimizer.load_state_dict( + loaded_dict["log_alpha_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic_1.eval() + self.alg.critic_2.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + mean = self.alg.actor.forward(obs) + actions = torch.tanh(mean) + actions = (actions * self.alg.action_delta + self.alg.action_offset).clamp( + self.alg.action_min, self.alg.action_max + ) + return actions + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py new file mode 100644 index 00000000..d6fe5ddf --- /dev/null +++ b/learning/runners/old_policy_runner.py @@ -0,0 +1,164 @@ +import os +import torch + +from learning.utils import Logger +from .BaseRunner import BaseRunner +from learning.algorithms import PPO # noqa: F401 +from learning.modules import ActorCritic, Actor, Critic + +logger = Logger() + + +class OldPolicyRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + self.init_storage() + logger.initialize( + self.env.num_envs, + self.env.dt, + 
self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + actor_critic = ActorCritic(actor, critic) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.actor_critic.train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + self.save() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + + self.alg.process_env_step(total_rewards, dones, timed_out) + self.alg.compute_returns(critic_obs) + logger.toc("collection") + + logger.tic("learning") + self.alg.update() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + + logger.attach_torch_obj_to_wandb( + (self.alg.actor_critic.actor, self.alg.actor_critic.critic) + ) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor_critic.actor.state_dict(), + "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + 
self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor_critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor_critic.actor.act_inference(obs) + + def export(self, path): + self.alg.actor_critic.export_policy(path) + + def init_storage(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + self.alg.init_storage( + self.env.num_envs, + self.num_steps_per_env, + actor_obs_shape=[num_actor_obs], + critic_obs_shape=[num_critic_obs], + action_shape=[num_actions], + ) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 3a40974a..4831b615 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -1,73 +1,125 @@ import os import torch +from tensordict import TensorDict from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.storage import DictStorage logger = Logger() +storage = DictStorage() class OnPolicyRunner(BaseRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) - def learn(self): - self.set_up_logger() + def learn(self, states_to_log_dict=None): + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) - rewards_dict = {} + rewards_dict = self.initialize_rewards_dict(n_policy_steps) - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + # burn in observation normalization. 
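As a worked example of the decimation set up at the top of learn() above (the numbers are hypothetical, not taken from any config in this patch):

# env.dt = 0.005 s -> control loop at 200 Hz (assumed)
# actor frequency  = 50 Hz                   (assumed)
env_dt, actor_frequency = 0.005, 50
n_policy_steps = int((1 / env_dt) / actor_frequency)  # 200 / 50 = 4 sim steps per action
logger_dt = env_dt * n_policy_steps                   # 0.02 s between logged policy steps
assert n_policy_steps > 0  # would fail if the actor frequency exceeded the control rate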
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } ) + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) - self.env.step() + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated + critic_obs = self.get_obs(self.critic_cfg["obs"]) - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, + } + ) + storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_all_categories() @@ -80,76 +132,139 @@ def learn(self): self.save() self.save() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, ) - rewards_dict.update( + 
rewards_dict[step].update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, - mask=~terminated, + mask=~self.env.terminated, + ), + inplace=True, + ) + + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device ) - def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] - ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( { - "actor_state_dict": self.alg.actor_critic.actor.state_dict(), - "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, }, path, ) def load(self, path, load_optimizer=True): - loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + loaded_dict = torch.load(path, weights_only=True) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) self.it = loaded_dict["iter"] def switch_to_eval(self): - self.alg.actor_critic.eval() + self.alg.actor.eval() + self.alg.critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) - return self.alg.actor_critic.actor.act_inference(obs) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) def export(self, path): - self.alg.actor_critic.export_policy(path) + self.alg.actor.export(path) - def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) 
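A self-contained sketch of the reward-integration pattern used by initialize_rewards_dict and update_rewards_dict above: one (n_policy_steps, num_envs) buffer per reward term, filled after every sim sub-step and reduced when the transition is stored. The reward keys and sizes are made up for illustration.

import torch
from tensordict import TensorDict

n_policy_steps, num_envs = 4, 8
keys = ["tracking", "torque_penalty"]  # hypothetical reward terms

rewards_dict = TensorDict(
    {k: torch.zeros(n_policy_steps, num_envs) for k in keys},
    batch_size=(n_policy_steps, num_envs),
)

for step in range(n_policy_steps):
    # stand-in for self.get_rewards(...) called after each env.step()
    rewards_dict[step].update({k: torch.rand(num_envs) for k in keys}, inplace=True)

per_term = rewards_dict.sum(dim=0)  # integrate each term over the sub-steps
total_rewards = torch.stack(tuple(per_term.values())).sum(dim=0)  # shape (num_envs,)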
- num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - self.alg.init_storage( - self.env.num_envs, - self.num_steps_per_env, - actor_obs_shape=[num_actor_obs], - critic_obs_shape=[num_critic_obs], - action_shape=[num_actions], - ) + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. + steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + + with torch.inference_mode(): + for i in range(steps): + actions = self.alg.act(actor_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/learning/runners/psd_sac_runner.py b/learning/runners/psd_sac_runner.py new file mode 100644 index 00000000..31afda18 --- /dev/null +++ b/learning/runners/psd_sac_runner.py @@ -0,0 +1,54 @@ +from learning.utils import Logger + +from learning.runners import OffPolicyRunner +from learning.modules import Critic, ChimeraActor # noqa F401 +from learning.modules.QRCritics import * # noqa F401 + +from learning.storage import ReplayBuffer +from learning.algorithms import SAC + + +logger = Logger() +storage = ReplayBuffer() + + +class PSACRunner(OffPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_in = self.get_obs_size(self.critic_cfg["obs"]) + num_actions # noqa: F841 + actor = ChimeraActor(num_actor_obs, num_actions, **self.actor_cfg) + + critic_class = self.critic_cfg["critic_class_name"] + critic_1 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + critic_2 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + target_critic_1 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + target_critic_2 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + + # critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + # critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + # target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + # target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + print(actor) + print(critic_1) + + self.alg = SAC( + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + device=self.device, + **self.alg_cfg, + ) diff --git a/learning/storage/SE_storage.py b/learning/storage/SE_storage.py index a63f5f9f..0f4eaf41 100644 --- a/learning/storage/SE_storage.py +++ b/learning/storage/SE_storage.py @@ -53,9 +53,9 @@ def clear(self): def mini_batch_generator(self, num_mini_batches, num_epochs=8): """Generate mini batch for learning""" batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // 
num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -69,8 +69,8 @@ def mini_batch_generator(self, num_mini_batches, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index edd7bd55..47f2593b 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,3 +3,5 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage +from .dict_storage import DictStorage +from .replay_buffer import ReplayBuffer \ No newline at end of file diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py new file mode 100644 index 00000000..2191093c --- /dev/null +++ b/learning/storage/dict_storage.py @@ -0,0 +1,48 @@ +import torch +from tensordict import TensorDict + + +class DictStorage: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length: + raise AssertionError("Rollout buffer overflow") + self.data[self.fill_count] = transition + self.fill_count += 1 + + def clear(self): + self.fill_count = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py new file mode 100644 index 00000000..350a2c25 --- /dev/null +++ b/learning/storage/replay_buffer.py @@ -0,0 +1,54 @@ +import torch +from tensordict import TensorDict + + +class ReplayBuffer: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + self.add_index = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length and self.add_index >= self.max_length: + self.add_index = 0 + self.data[self.add_index] = transition + self.fill_count += 1 + self.add_index += 1 + + def 
get_data(self): + return self.data[: min(self.fill_count, self.max_length), :] + + def clear(self): + self.fill_count = 0 + self.add_index = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/storage/rollout_storage.py b/learning/storage/rollout_storage.py index bba48e02..0f2885a3 100644 --- a/learning/storage/rollout_storage.py +++ b/learning/storage/rollout_storage.py @@ -182,11 +182,11 @@ def get_statistics(self): trajectory_lengths = done_indices[1:] - done_indices[:-1] return trajectory_lengths.float().mean(), self.rewards.mean() - def mini_batch_generator(self, num_mini_batches, num_epochs=8): + def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -207,8 +207,8 @@ def mini_batch_generator(self, num_mini_batches, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] diff --git a/learning/utils/PBRS/test_usecase.py b/learning/utils/PBRS/test_usecase.py index a84ef547..be5d0412 100644 --- a/learning/utils/PBRS/test_usecase.py +++ b/learning/utils/PBRS/test_usecase.py @@ -30,14 +30,14 @@ def test_PBRS(): logger.initialize(num_envs=1, episode_dt=0.1, total_iterations=100, device="cpu") self_env = MyTask() - # let's create a dummy policy_cfg with just the reward weights - policy_cfg = {"reward": {"weights": {"dummy": 1}}} - policy_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} + # let's create a dummy critic_cfg with just the reward weights + critic_cfg = {"reward": {"weights": {"dummy": 1}}} + critic_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} # and a dummy env PBRS = PotentialBasedRewardShaping( - policy_cfg["reward"]["pbrs_weights"], device="cpu" + critic_cfg["reward"]["pbrs_weights"], device="cpu" ) assert PBRS.get_reward_keys() == ["PBRS_first", "PBRS_second"] diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index a39a3daf..b39dc16d 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,40 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
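A usage sketch for the ReplayBuffer added above (DictStorage behaves the same way except that it raises on overflow instead of wrapping around); all shapes are tiny illustrative values.

import torch
from tensordict import TensorDict
from learning.storage import ReplayBuffer

num_envs, obs_dim, act_dim = 4, 3, 2
dummy = TensorDict(
    {
        "actor_obs": torch.zeros(num_envs, obs_dim),
        "actions": torch.zeros(num_envs, act_dim),
        "rewards": torch.zeros(num_envs),
        "dones": torch.zeros(num_envs, dtype=torch.bool),
    },
    batch_size=num_envs,
)

buffer = ReplayBuffer()
# max_storage=16 with 4 envs -> 4 time slots; the write index wraps once full
buffer.initialize(dummy, num_envs=num_envs, max_storage=16, device="cpu")
for _ in range(6):  # two more adds than slots, so the oldest slots get overwritten
    buffer.add_transitions(dummy)

data = buffer.get_data()        # TensorDict with batch_size (4, 4)
print(data["actor_obs"].shape)  # torch.Size([4, 4, 3])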
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - from .utils import ( - split_and_pad_trajectories, - unpad_trajectories, remove_zero_weighted_rewards, set_discount_from_horizon, + polyak_update ) +from .dict_utils import * from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py new file mode 100644 index 00000000..73039526 --- /dev/null +++ b/learning/utils/dict_utils.py @@ -0,0 +1,96 @@ +import numpy as np +import torch +from tensordict import TensorDict + + +@torch.no_grad +def compute_MC_returns(data: TensorDict, gamma, critic=None): + # todo not as accurate as taking + if critic is None: + last_values = torch.zeros_like(data["rewards"][0]) + else: + last_values = critic.evaluate(data["critic_obs"][-1]) + + returns = torch.zeros_like(data["rewards"]) + returns[-1] = data["rewards"][-1] + gamma * last_values * ~data["terminated"][-1] + for k in reversed(range(data["rewards"].shape[0] - 1)): + not_done = ~data["dones"][k] + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + if critic is not None: + returns[k] += ( + gamma + * critic.evaluate(data["critic_obs"][k]) + * data["timed_out"][k] + * ~data["terminated"][k] + ) + return returns + + +@torch.no_grad +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) + + +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) + advantages = torch.zeros_like(data["values"]) + if last_values is not None: + # todo check this + not_done = ~data["dones"][-1] + advantages[-1] = ( + data["rewards"][-1] + + gamma * data["values"][-1] * data["timed_out"][-1] + + gamma * last_values * not_done + - data["values"][-1] + ) + + for k in reversed(range(data["values"].shape[0] - 1)): + not_done = ~data["dones"][k] + td_error = ( + data["rewards"][k] + + gamma * data["values"][k] * data["timed_out"][k] + + gamma * data["values"][k + 1] * not_done + - data["values"][k] + ) + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] + + return advantages + + +# todo change num_epochs to num_batches +@torch.no_grad +def create_uniform_generator(data, batch_size, max_gradient_steps=100): + n, m = data.shape + total_data = n * m + + if batch_size > total_data: + batch_size = total_data + + num_batches_per_epoch = total_data // batch_size + num_epochs = max_gradient_steps // num_batches_per_epoch + last_epoch_batches = max_gradient_steps - num_epochs * num_batches_per_epoch + + for epoch in range(num_epochs): + indices = torch.randperm(total_data, device=data.device) + for i in 
range(num_batches_per_epoch): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data + else: + indices = torch.randperm(total_data, device=data.device) + for i in range(last_epoch_batches): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data + + +@torch.no_grad +def export_to_numpy(data, path): + # check if path ends iwth ".npz", and if not append it. + if not path.endswith(".npz"): + path += ".npz" + np.savez_compressed(path, **{key: val.cpu().numpy() for key, val in data.items()}) + return diff --git a/learning/utils/utils.py b/learning/utils/utils.py index b1a20bcf..d4eac980 100644 --- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -1,96 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - -import torch - - -def split_and_pad_trajectories(tensor, dones): - """ - Splits trajectories at done indices. Then concatenates them and pads - with zeros up to the length og the longest trajectory. 
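A sketch of how the create_uniform_generator helper above can be driven; the (time, env) layout mirrors what DictStorage collects, and all sizes are illustrative.

import torch
from tensordict import TensorDict
from learning.utils import create_uniform_generator

steps, num_envs, obs_dim = 8, 4, 3
data = TensorDict(
    {
        "critic_obs": torch.randn(steps, num_envs, obs_dim),
        "rewards": torch.randn(steps, num_envs),
    },
    batch_size=(steps, num_envs),
)

# 32 samples total, batch_size=8 -> 4 minibatches per epoch, capped at 6 gradient steps
for batch in create_uniform_generator(data, batch_size=8, max_gradient_steps=6):
    print(batch.batch_size)  # torch.Size([8]): flattened (time * env) samples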
- Returns masks corresponding to valid parts of the trajectories - Example: - Input: [ [a1, a2, a3, a4 | a5, a6], - [b1, b2 | b3, b4, b5 | b6] - ] - - Output:[ [a1, a2, a3, a4], | [ [True, True, True, True], - [a5, a6, 0, 0], | [True, True, False, False], - [b1, b2, 0, 0], | [True, True, False, False], - [b3, b4, b5, 0], | [True, True, True, False], - [b6, 0, 0, 0] | [True, False, False, False], - ] | ] - - Assumes that the inputy has the following dimension order: - [time, number of envs, aditional dimensions] - """ - dones = dones.clone() - dones[-1] = 1 - # * Permute the buffers to have order - # * (num_envs, num_transitions_per_env, ...), for correct reshaping - flat_dones = dones.transpose(1, 0).reshape(-1, 1) - - # * Get length of trajectory by counting the number of - # * successive not done elements - done_indices = torch.cat( - ( - flat_dones.new_tensor([-1], dtype=torch.int64), - flat_dones.nonzero()[:, 0], - ) - ) - trajectory_lengths = done_indices[1:] - done_indices[:-1] - trajectory_lengths_list = trajectory_lengths.tolist() - # * Extract the individual trajectories - trajectories = torch.split( - tensor.transpose(1, 0).flatten(0, 1), trajectory_lengths_list - ) - padded_trajectories = torch.nn.utils.rnn.pad_sequence(trajectories) - - trajectory_masks = trajectory_lengths > torch.arange( - 0, tensor.shape[0], device=tensor.device - ).unsqueeze(1) - return padded_trajectories, trajectory_masks - - -def unpad_trajectories(trajectories, masks): - """Does the inverse operation of split_and_pad_trajectories()""" - # * Need to transpose before and after the masking to have proper reshaping - return ( - trajectories.transpose(1, 0)[masks.transpose(1, 0)] - .view(-1, trajectories.shape[0], trajectories.shape[-1]) - .transpose(1, 0) - ) - - def remove_zero_weighted_rewards(reward_weights): for name in list(reward_weights.keys()): if reward_weights[name] == 0: @@ -110,3 +17,9 @@ def set_discount_from_horizon(dt, horizon): discount_factor = 1 - 1 / discrete_time_horizon return discount_factor + + +def polyak_update(online, target, polyak_factor): + for op, tp in zip(online.parameters(), target.parameters()): + tp.data.copy_((1.0 - polyak_factor) * op.data + polyak_factor * tp.data) + return target diff --git a/pyproject.toml b/pyproject.toml index a1906be6..131f359f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,8 +40,6 @@ target-version = "py38" # McCabe complexity (`C901`) by default. ignore = ["E722"] -ignore-init-module-imports = true - # Allow unused variables when underscore-prefixed. 
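A sketch of how the polyak_update helper added above would be used for the SAC target critics; the module and the factor are illustrative.

import torch.nn as nn
from learning.utils import polyak_update

critic = nn.Linear(8, 1)
target_critic = nn.Linear(8, 1)
target_critic.load_state_dict(critic.state_dict())  # targets start as copies

# each call keeps 99.5% of the target weights and blends in 0.5% of the online net
polyak_update(critic, target_critic, polyak_factor=0.995)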
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
diff --git a/requirements.txt b/requirements.txt index e2a6fe2f..923df0e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,6 @@ mss ruff pre-commit onnx -onnxruntime \ No newline at end of file +onnxruntime +tensordict +optuna \ No newline at end of file
diff --git a/resources/robots/mit_humanoid/friction_model_L.pt b/resources/robots/mit_humanoid/friction_model_L.pt new file mode 100644 index 00000000..d1d680d5 Binary files /dev/null and b/resources/robots/mit_humanoid/friction_model_L.pt differ
diff --git a/resources/robots/mit_humanoid/friction_model_R.pt b/resources/robots/mit_humanoid/friction_model_R.pt new file mode 100644 index 00000000..ce831fc8 Binary files /dev/null and b/resources/robots/mit_humanoid/friction_model_R.pt differ
diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf index 51a84eee..f7f55186 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf
[URDF XML stripped in extraction: eight hunks (lines 562, 607, 660, 711, 790, 835, 888, 939, "Simple Foot: foot approximated as single box-contact") each change a joint to type="fixed"]
diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf new file mode 100644 index 00000000..98a0773a --- /dev/null +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -0,0 +1,996 @@
[996 added lines of URDF XML stripped in extraction]
diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf new file mode 100644 index 00000000..90e894e4 --- /dev/null +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -0,0 +1,56 @@
[56 added lines of URDF XML stripped in extraction]
diff --git a/scripts/play.py b/scripts/play.py index 29a837ea..37882f7e 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -9,16 +9,16 @@ def setup(args): env_cfg, train_cfg = task_registry.create_cfgs(args) - env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + env_cfg.env.num_envs = 32 if hasattr(env_cfg, "push_robots"): env_cfg.push_robots.toggle = False if hasattr(env_cfg, "commands"): env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 9999 + env_cfg.env.episode_length_s = 50 env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() + env_cfg.init_state.reset_mode = "reset_to_range" env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" train_cfg.runner.resume = True train_cfg.logging.enable_local_saving = False runner = task_registry.make_alg_runner(env, train_cfg) @@ -46,9 +46,9 @@ def play(env, runner, train_cfg): if env.cfg.viewer.record: recorder.update(i) runner.set_actions( - runner.policy_cfg["actions"], + runner.actor_cfg["actions"], runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], + runner.actor_cfg["disable_actions"], ) env.step() env.check_exit()
diff --git a/scripts/train.py b/scripts/train.py index dfa6afea..759992cf 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -13,7 +13,6 @@ def setup(): wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) randomize_episode_counters(env) - policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir)
diff --git a/setup.py b/setup.py index c918d911..06709c4e 100644 --- a/setup.py +++ b/setup.py @@ -2,20 +2,16 @@ from distutils.core import setup setup( - name="gpuGym", - version="1.0.1", + name="QGym", + version="1.0.3", author="Biomimetic Robotics Lab", license="BSD-3-Clause", packages=find_packages(), - description="Isaac Gym environments for Legged Robots", + description="Isaac Gym environments", install_requires=[ "isaacgym", - "matplotlib", - "pandas", - "tensorboard", "setuptools==59.5.0", "torch>=1.4.0", "torchvision>=0.5.0", - "numpy>=1.16.4", ], )
diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index 4781a665..ce645b5c 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -7,7 +7,7 @@ # * imports for onnx test import onnx import onnxruntime -from learning.modules import ActorCritic +from learning.modules import Actor from gym.utils.helpers import get_load_path from gym import LEGGED_GYM_ROOT_DIR @@ -27,21 +27,18 @@ def learn_policy(args): def load_saved_policy(runner): - num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_critic_obs = runner.get_obs_size(runner.policy_cfg["critic_obs"]) - num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **runner.policy_cfg - ).to(runner.device) + num_actor_obs = runner.get_obs_size(runner.actor_cfg["obs"]) + num_actions = runner.get_action_size(runner.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **runner.actor_cfg).to(runner.device) resume_path = get_load_path( name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], checkpoint=runner.cfg["checkpoint"], ) - loaded_dict = torch.load(resume_path) -
actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - actor_critic.eval() - return actor_critic + loaded_dict = torch.load(resume_path, weights_only=True) + actor.load_state_dict(loaded_dict["actor_state_dict"]) + actor.eval() + return actor class TestDefaultIntegration: @@ -60,7 +57,7 @@ def test_default_integration_settings(self, args): with torch.no_grad(): actions = runner.get_inference_actions() - deployed_actions = runner.env.get_states(runner.policy_cfg["actions"]) + deployed_actions = runner.env.get_states(runner.actor_cfg["actions"]) assert ( torch.equal(actions, torch.zeros_like(actions)) is False ), "Policy returning all zeros" @@ -95,20 +92,20 @@ def test_default_integration_settings(self, args): f"{model_5_path}" "(last iteration) was not saved" ) - obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) - actions_first = runner.alg.actor_critic.act_inference(obs).cpu().clone() + obs = torch.randn_like(runner.get_obs(runner.actor_cfg["obs"])) + actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) - actions_loaded = runner.alg.actor_critic.act_inference(obs).cpu().clone() + actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() assert torch.equal(actions_first, actions_loaded), "Model loading failed" # * test onnx exporting - loaded_actor_critic = load_saved_policy(runner) + loaded_actor = load_saved_policy(runner) export_path = os.path.join( LEGGED_GYM_ROOT_DIR, "tests", "integration_tests", "exported_policy" ) - loaded_actor_critic.export_policy(export_path) + loaded_actor.export(export_path) # load the exported onnx path_to_onnx = os.path.join(export_path, "policy.onnx") @@ -121,9 +118,9 @@ def test_default_integration_settings(self, args): ) # compute torch output with torch.no_grad(): - test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] - runner_out = runner.alg.actor_critic.actor.act_inference(test_input) - loaded_out = loaded_actor_critic.actor.act_inference(test_input) + test_input = runner.get_obs(runner.actor_cfg["obs"])[0] + runner_out = runner.alg.actor.act_inference(test_input) + loaded_out = loaded_actor.act_inference(test_input) # compute ONNX Runtime output prediction ort_inputs = {ort_session.get_inputs()[0].name: test_input.cpu().numpy()} @@ -131,8 +128,8 @@ def test_default_integration_settings(self, args): # compare ONNX Runtime and PyTorch results torch.testing.assert_close( - runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-7 ) torch.testing.assert_close( - runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-7 ) diff --git a/tests/regression_tests/run_n_iterations.py b/tests/regression_tests/run_n_iterations.py index 6a3a4965..29e2f424 100644 --- a/tests/regression_tests/run_n_iterations.py +++ b/tests/regression_tests/run_n_iterations.py @@ -33,7 +33,7 @@ def _run_training_test(output_tensor_file, env, runner): runner.learn() # * get the test values after learning - actions = runner.env.get_states(runner.policy_cfg["actions"]) + actions = runner.env.get_states(runner.actor_cfg["actions"]) # * return the values to the parent for assertion actions.detach().cpu() diff --git a/tests/regression_tests/test_generated_torch_files.py b/tests/regression_tests/test_generated_torch_files.py index ef8ddc15..11b9ff5a 100644 --- a/tests/regression_tests/test_generated_torch_files.py +++ 
b/tests/regression_tests/test_generated_torch_files.py @@ -2,8 +2,8 @@ def compare_tensors(file1, file2): - tensor1 = torch.load(file1) - tensor2 = torch.load(file2) + tensor1 = torch.load(file1, weights_only=True) + tensor2 = torch.load(file2, weights_only=True) assert torch.all( torch.eq(tensor1, tensor2) ), f"Tensors in {file1} and {file2} are not equal."
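The tests above (and the runner load() methods) now pass weights_only=True to torch.load, which works because the checkpoints contain only tensors, state dicts, and ints. A minimal round-trip sketch with an illustrative file name:

import torch
import torch.nn as nn

actor = nn.Linear(4, 2)
torch.save({"actor_state_dict": actor.state_dict(), "iter": 10}, "model_10.pt")

loaded = torch.load("model_10.pt", weights_only=True)  # restricted unpickler
actor.load_state_dict(loaded["actor_state_dict"])
print(loaded["iter"])  # 10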