diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 763007cc..4b5a1546 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,8 +19,8 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", - "HumanoidBouncing": ".mit_humanoid.humanoid_bouncing", "Pendulum": ".pendulum.pendulum", + "LQRPendulum": ".pendulum.lqr_pendulum", } config_dict = { @@ -32,8 +32,8 @@ "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", - "HumanoidBouncingCfg": ".mit_humanoid.humanoid_bouncing_config", "PendulumCfg": ".pendulum.pendulum_config", + "LQRPendulumCfg": ".pendulum.lqr_pendulum_config", } runner_config_dict = { @@ -45,8 +45,8 @@ "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", - "HumanoidBouncingRunnerCfg": ".mit_humanoid.humanoid_bouncing_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", + "LQRPendulumRunnerCfg": ".pendulum.lqr_pendulum_config", } task_dict = { @@ -68,12 +68,9 @@ "HumanoidRunningCfg", "HumanoidRunningRunnerCfg", ], - "humanoid_bouncing": ["HumanoidBouncing", - "HumanoidBouncingCfg", - "HumanoidBouncingRunnerCfg"], - "a1": ["A1", "A1Cfg", "A1RunnerCfg"], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], - "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"] + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], + "lqr_pendulum": ["LQRPendulum", "LQRPendulumCfg", "LQRPendulumRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index cc19d8f3..87477b56 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -137,13 +137,13 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -154,7 +154,19 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 + projected_gravity = 0.02 + + class critic: + hidden_dims = [256, 256, 256] + # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -165,16 +177,8 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 1.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 4ce7f214..d18f7cca 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -85,6 +85,7 @@ class push_robots: toggle = True interval_s = 1 
max_push_vel_xy = 0.5 + push_box_dims = [0.2, 0.2, 0.2] class domain_rand(LeggedRobotCfg.domain_rand): randomize_base_mass = True @@ -120,14 +121,14 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -138,7 +139,19 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 # 0.027, 0.14, 0.37 + projected_gravity = 0.02 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [256, 256, 256] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -149,16 +162,8 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 # 0.027, 0.14, 0.37 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 3.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 diff --git a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py index badc3faa..85b0c42b 100644 --- a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py +++ b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py @@ -56,12 +56,15 @@ class domain_rand(AnymalCFlatCfg.domain_rand): class AnymalCRoughCCfgPPO(AnymalCFlatCfgPPO): - class policy(AnymalCFlatCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] + class actor(AnymalCFlatCfgPPO.actor): + hidden_dims = [128, 64, 32] critic_hidden_dims = [128, 64, 32] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + class critic(AnymalCFlatCfgPPO.critic): + pass + class algorithm(AnymalCFlatCfgPPO.algorithm): entropy_coef = 0.01 diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index a7c1b7ae..b8f47ceb 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -417,8 +417,8 @@ def _init_buffers(self): self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) # * check that init range highs and lows are consistent # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - self.initialize_ranges_for_initial_conditions() + # if self.cfg.init_state.reset_mode == "reset_to_range": + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index a59cc9eb..b5b5e88d 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, 
sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False class noise: - noise = 0.1 # implement as needed, also in your robot class + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] class rewards: class weights: @@ -182,7 +181,7 @@ class algorithm: class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 # per iteration max_iterations = 500 # number of policy updates diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..b6667502 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -233,10 +233,9 @@ class LeggedRobotRunnerCfg(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True @@ -263,6 +262,17 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] + class reward: class weights: tracking_lin_vel = 0.0 @@ -283,25 +293,30 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + mini_batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 24 + algorithm_class_name = "PPO2" + num_steps_per_env = 24 # deprecate max_iterations = 1500 save_interval = 50 run_name = "" diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..038fde5a 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -45,6 +45,7 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] = False diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index d8a19292..6e073d5c 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ 
b/gym/envs/cartpole/cartpole_config.py @@ -67,15 +67,14 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 - actor_hidden_dims = [num_units] * num_layers - critic_hidden_dims = [num_units] * num_layers + hidden_dims = [num_units] * num_layers activation = "elu" - actor_obs = [ + obs = [ "cart_obs", "pole_trig_obs", "dof_vel", @@ -83,8 +82,6 @@ class policy(FixedRobotCfgPPO.policy): "pole_vel_square", ] - critic_obs = actor_obs - actions = ["tau_ff"] class noise: @@ -94,6 +91,20 @@ class noise: pole_vel = 0.010 actuation = 0.00 + class critic: + num_layers = 2 + num_units = 32 + hidden_dims = [num_units] * num_layers + activation = "elu" + + obs = [ + "cart_obs", + "pole_trig_obs", + "dof_vel", + "cart_vel_square", + "pole_vel_square", + ] + class reward: class weights: pole_pos = 5 @@ -125,7 +136,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 96 # per iteration max_iterations = 500 # number of policy updates diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..447d4d3b 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -124,24 +124,13 @@ class scaling(LeggedRobotCfg.scaling): class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [128, 64] + class actor: + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - - actor_obs = [ - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "dof_pos_target", - ] - critic_obs = [ - "base_height", + obs = [ "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -150,9 +139,10 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_target", ] - + normalize_obs = True actions = ["dof_pos_target"] add_noise = True + disable_actions = False class noise: scale = 1.0 @@ -164,8 +154,23 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class critic: + hidden_dims = [128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_target", + ] + + class reward: + class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 @@ -185,24 +190,11 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" max_iterations = 
500 - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..a4acefa6 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -161,13 +161,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] + class policy: + hidden_dims = [256, 256, 128] critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -244,5 +244,5 @@ class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "FullSend" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..eedbd904 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,15 +67,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor: + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -83,8 +81,26 @@ class policy(MiniCheetahRunnerCfg.policy): "dof_vel", "phase_obs", ] + normalize_obs = True + + actions = ["dof_pos_target"] + disable_actions = False - critic_obs = [ + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -96,12 +112,6 @@ class policy(MiniCheetahRunnerCfg.policy): "dof_pos_target", ] - actions = ["dof_pos_target"] - disable_actions = False - - class noise: - pass - class reward: class weights: tracking_lin_vel = 4.0 @@ -123,35 +133,15 @@ class weights: swing_grf = 1.5 stance_grf = 1.5 - class pbrs_weights: - reference_traj = 0.0 - swing_grf = 0.0 - stance_grf = 0.0 - min_base_height = 0.0 - orientation = 0.0 - class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - learning_rate = 5.0e-5 - schedule = "adaptive" # can be adaptive, fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 1.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" - num_steps_per_env = 32 + algorithm_class_name = "PPO2" + num_steps_per_env = 32 # deprecate diff --git 
a/gym/envs/mit_humanoid/humanoid_bouncing.py b/gym/envs/mit_humanoid/humanoid_bouncing.py deleted file mode 100644 index 6dedf8dd..00000000 --- a/gym/envs/mit_humanoid/humanoid_bouncing.py +++ /dev/null @@ -1,443 +0,0 @@ -import torch - -from isaacgym.torch_utils import quat_rotate_inverse -from gym.envs import LeggedRobot -from isaacgym import gymutil -from isaacgym import gymapi - - -class HumanoidBouncing(LeggedRobot): - def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): - super().__init__(gym, sim, cfg, sim_params, sim_device, headless) - - def _init_buffers(self): - super()._init_buffers() - - # * get the body_name to body_index dict - body_dict = self.gym.get_actor_rigid_body_dict( - self.envs[0], self.actor_handles[0] - ) - # * extract a list of body_names where the index is the id number - body_names = [ - body_tuple[0] - for body_tuple in sorted( - body_dict.items(), key=lambda body_tuple: body_tuple[1] - ) - ] - # * construct a list of id numbers corresponding to end_effectors - self.end_effector_ids = [] - for end_effector_name in self.cfg.asset.end_effector_names: - self.end_effector_ids.extend( - [ - body_names.index(body_name) - for body_name in body_names - if end_effector_name in body_name - ] - ) - - # * end_effector_pos is world-frame and converted to env_origin - self.end_effector_pos = self._rigid_body_pos[ - :, self.end_effector_ids - ] - self.env_origins.unsqueeze(dim=1).expand( - self.num_envs, len(self.end_effector_ids), 3 - ) - self.end_effector_quat = self._rigid_body_quat[:, self.end_effector_ids] - - self.end_effector_lin_vel = torch.zeros( - self.num_envs, - len(self.end_effector_ids), - 3, - dtype=torch.float, - device=self.device, - ) - self.end_effector_ang_vel = torch.zeros( - self.num_envs, - len(self.end_effector_ids), - 3, - dtype=torch.float, - device=self.device, - ) - - # * end_effector vels are body-relative like body vels above - for index in range(len(self.end_effector_ids)): - self.end_effector_lin_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_lin_vel[:, self.end_effector_ids][:, index, :], - ) - self.end_effector_ang_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_ang_vel[:, self.end_effector_ids][:, index, :], - ) - - # * separate legs and arms - self.dof_pos_target_legs = torch.zeros( - self.num_envs, 10, dtype=torch.float, device=self.device - ) - self.dof_pos_target_arms = torch.zeros( - self.num_envs, 8, dtype=torch.float, device=self.device - ) - self.dof_pos_legs = torch.zeros( - self.num_envs, 10, dtype=torch.float, device=self.device - ) - self.dof_pos_arms = torch.zeros( - self.num_envs, 8, dtype=torch.float, device=self.device - ) - self.dof_vel_legs = torch.zeros( - self.num_envs, 10, dtype=torch.float, device=self.device - ) - self.dof_vel_arms = torch.zeros( - self.num_envs, 8, dtype=torch.float, device=self.device - ) - - # * other - self.base_pos = self.root_states[:, 0:3] - self.phase = torch.zeros( - self.num_envs, 1, dtype=torch.float, device=self.device - ) - self.phase_freq = 1.0 - - # * high level - self.hl_impulses = torch.zeros( - self.num_envs, 4, 5, dtype=torch.float, device=self.device - ) - self.hl_impulses_flat = self.hl_impulses.flatten(start_dim=1) - self.hl_ix = torch.zeros( - self.num_envs, 1, dtype=torch.float, device=self.device - ) - self.hl_commands = torch.zeros( - self.num_envs, 6, dtype=torch.float, device=self.device - ) - self.time_since_hl_query = torch.zeros( - self.num_envs, 1, device=self.device, dtype=torch.float - ) - - 
def _pre_physics_step(self): - super()._pre_physics_step() - self.dof_pos_target[:, :10] = self.dof_pos_target_legs - self.dof_pos_target[:, 10:] = self.dof_pos_target_arms - - envs_to_resample = torch.where( - self.time_since_hl_query == self.cfg.high_level.interval, - True, - False, - ) - if envs_to_resample.any().item(): - self._resample_high_level( - envs_to_resample.nonzero(as_tuple=False).flatten() - ) - - self.hl_impulses_flat = self.hl_impulses.flatten(start_dim=1) - - def _reset_system(self, env_ids): - super()._reset_system(env_ids) - # if self.cfg.commands.resampling_time == -1: - # self.commands[env_ids, :] = 0. - self.phase[env_ids, 0] = torch.rand( - (torch.numel(env_ids),), requires_grad=False, device=self.device - ) - self._resample_high_level(env_ids) - self.hl_impulses_flat = self.hl_impulses.flatten(start_dim=1) - - def _post_physics_step(self): - super()._post_physics_step() - self.time_since_hl_query += 1 # self.dt - self.phase_sin = torch.sin(2 * torch.pi * self.phase) - self.phase_cos = torch.cos(2 * torch.pi * self.phase) - - self.in_contact = self.contact_forces[:, self.end_effector_ids, 2].gt(0.0) - self.phase = torch.fmod(self.phase + self.dt, 1.0) - - self.dof_pos_legs = self.dof_pos[:, :10] - self.dof_pos_arms = self.dof_pos[:, 10:] - self.dof_vel_legs = self.dof_vel[:, :10] - self.dof_vel_arms = self.dof_vel[:, 10:] - - # * end_effector_pos is world-frame and converted to env_origin - self.end_effector_pos = self._rigid_body_pos[ - :, self.end_effector_ids - ] - self.env_origins.unsqueeze(dim=1).expand( - self.num_envs, len(self.end_effector_ids), 3 - ) - self.end_effector_quat = self._rigid_body_quat[:, self.end_effector_ids] - - # * end_effector vels are body-relative like body vels above - for index in range(len(self.end_effector_ids)): - self.end_effector_lin_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_lin_vel[:, self.end_effector_ids][:, index, :], - ) - self.end_effector_ang_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_ang_vel[:, self.end_effector_ids][:, index, :], - ) - - # * update HL indices - delta_hl_now = self.hl_impulses[:, 0, :] - self.time_since_hl_query.view( - self.num_envs, 1 - ).expand(self.num_envs, 5) - self.hl_ix = torch.argmin( - torch.where( - delta_hl_now >= 0, - delta_hl_now, - float("inf") * torch.ones_like(delta_hl_now), - ), - dim=1, - ).unsqueeze(1) - self._update_hl_commands() - # roll out next time step HL command - self.hl_commands[:, :3] += self.hl_commands[:, 3:] * self.dt - self.hl_commands[:, 5] -= 9.81 * self.dt - - def _resample_high_level(self, env_ids): - """ - Updates impulse sequences for envs its passed s.t. 
the first impulse compensates for - tracking error and the rest enforce the same bouncing ball trajectory on the COM - """ - self.time_since_hl_query[env_ids] = 0.0 - delta_touchdown = self._time_to_touchdown( - ( - self.root_states[env_ids, 2:3] - - self.cfg.reward_settings.base_height_target - ), - self.base_lin_vel[env_ids, 2].unsqueeze(1), - -9.81, - ) - - step_time = self.cfg.high_level.sec_per_gait - - first_xy = torch.tensor( - [[[1], [0]]], dtype=torch.float, device=self.device - ).repeat(env_ids.shape[0], 1, 1) - ( - self.base_lin_vel[env_ids, :2] * delta_touchdown.repeat(1, 2) - ).unsqueeze(2) - first_z = torch.tensor( - [[[step_time * 9.81 / 2]]], dtype=torch.float, device=self.device - ).repeat(env_ids.shape[0], 1, 1) - ( - self.base_lin_vel[env_ids, 2] - 9.81 * delta_touchdown.squeeze(1) - ).view(env_ids.shape[0], 1, 1) - first_impulse = torch.cat((first_xy, first_z), dim=1) - remaining_impulses = torch.tensor( - [[0], [0], [step_time * 9.81]], dtype=torch.float, device=self.device - ).repeat(env_ids.shape[0], 1, 4) - impulse_mag_buf = torch.cat((first_impulse, remaining_impulses), dim=2) - - time_rollout = torch.cat( - ( - delta_touchdown, - delta_touchdown + (1.0 * step_time), - delta_touchdown + (2.0 * step_time), - delta_touchdown + (3.0 * step_time), - delta_touchdown + (4.0 * step_time), - ), - dim=1, - ).unsqueeze(1) - - time_rollout = time_rollout / self.dt - time_rollout = time_rollout.round().int() - - self.hl_impulses[env_ids] = torch.cat((time_rollout, impulse_mag_buf), dim=1) - self.hl_ix[env_ids] = 0 - - # seed the root states for roll out via post physics - self.hl_commands[env_ids, :3] = self.base_pos[env_ids, :] - self.hl_commands[env_ids, 3:] = self.base_lin_vel[env_ids, :] - - if self.cfg.viewer.record: - # draw new high level target trajectories - self.gym.clear_lines(self.viewer) - sphere_geom = gymutil.WireframeSphereGeometry( - 0.1, 64, 64, None, color=(1, 1, 0) - ) - - for i in range(self.num_envs): - base_pos = (self.root_states[i, :3]).cpu().numpy() - sphere_pose = gymapi.Transform( - gymapi.Vec3(base_pos[0], base_pos[1], base_pos[2]), r=None - ) - gymutil.draw_lines( - sphere_geom, self.gym, self.viewer, self.envs[i], sphere_pose - ) - line = [ - base_pos[0], - base_pos[1], - self.cfg.reward_settings.base_height_target, - base_pos[0] + 100, - base_pos[1], - self.cfg.reward_settings.base_height_target, - ] - self.gym.add_lines(self.viewer, self.envs[i], 1, line, [0.85, 0.1, 0.1]) - - def _time_to_touchdown(self, pos, vel, acc): - """ - Assumes robot COM in projectile motion to calculate next touchdown - """ - determinant = torch.square(vel) - 4 * pos * 0.5 * acc - solution = torch.where(determinant < 0.0, False, True) - no_solution = torch.eq(solution, False).nonzero(as_tuple=True)[0] - t1 = (-vel + torch.sqrt(determinant)) / (2 * 0.5 * acc) - t2 = (-vel - torch.sqrt(determinant)) / (2 * 0.5 * acc) - result = torch.where(t2 > t1, t2, t1) - result[no_solution] = self.dt - return result - - def _update_hl_commands(self): - """ - Checks which environments should have an impulse imparted on them and - updates HL command per-step rollout's velocities accordingly - """ - vel_update_indices = torch.argmax( - (self.hl_impulses[:, 0, :] == self.time_since_hl_query).float(), dim=1 - ).unsqueeze(1) - vel_update_indices[ - (vel_update_indices == 0) - & (self.hl_impulses[:, 0, 0].unsqueeze(1) != self.time_since_hl_query) - ] = -1 - vel_update_indices = vel_update_indices.squeeze(1) - vel_update_envs = torch.nonzero(vel_update_indices != -1) - 
self.hl_commands[vel_update_envs, 3:] += self.hl_impulses[ - vel_update_envs, 1:, vel_update_indices[vel_update_envs] - ] - - def _check_terminations_and_timeouts(self): - """Check if environments need to be reset""" - super()._check_terminations_and_timeouts() - - # * Termination for velocities, orientation, and low height - self.terminated |= (self.base_lin_vel.norm(dim=-1, keepdim=True) > 10).any( - dim=1 - ) - self.terminated |= (self.base_ang_vel.norm(dim=-1, keepdim=True) > 5).any(dim=1) - self.terminated |= (self.projected_gravity[:, 0:1].abs() > 0.7).any(dim=1) - self.terminated |= (self.projected_gravity[:, 1:2].abs() > 0.7).any(dim=1) - self.terminated |= (self.base_pos[:, 2:3] < 0.3).any(dim=1) - - self.to_be_reset = self.timed_out | self.terminated - - # ########################## REWARDS ######################## # - - # * Task rewards * # - - def _reward_tracking_hl_pos(self): - # error = self.hl_commands[:, 2] - self.base_pos[:, 2] - # error /= self.scales["hl_pos"] - # error = error.flatten() - # return self._sqrdexp(error) - error = self.hl_commands[:, :3] - self.base_pos[:, :3] - error /= self.scales["hl_pos"] - return self._sqrdexp(error).sum(dim=1) - - def _reward_tracking_hl_vel(self): - error = self.hl_commands[:, 3:] - self.base_lin_vel - error /= self.scales["hl_vel"] - return self._sqrdexp(error).sum(dim=1) - - def _reward_tracking_ang_vel(self): - ang_vel_error = 0.0 - self.base_ang_vel[:, 2] - ang_vel_error /= self.scales["base_ang_vel"] - return self._sqrdexp(ang_vel_error) - - # * Shaping rewards * # - - def _reward_base_height(self): - error = self.base_height - self.cfg.reward_settings.base_height_target - error /= self.scales["base_height"] - error = error.flatten() - return self._sqrdexp(error) - - def _reward_orientation(self): - return self._sqrdexp(self.projected_gravity[:, 2] + 1) - - def _reward_joint_regularization_legs(self): - # * Reward joint poses and symmetry - reward = self._reward_hip_yaw_zero() - reward += self._reward_hip_abad_symmetry() - reward += self._reward_hip_pitch_symmetry() - return reward / 3.0 - - def _reward_hip_yaw_zero(self): - error = self.dof_pos[:, 0] - self.default_dof_pos[:, 0] - reward = self._sqrdexp(error / self.scales["dof_pos"][0]) / 2.0 - error = self.dof_pos[:, 5] - self.default_dof_pos[:, 5] - reward += self._sqrdexp(error / self.scales["dof_pos"][5]) / 2.0 - return reward - - def _reward_hip_abad_symmetry(self): - error = ( - self.dof_pos[:, 1] / self.scales["dof_pos"][1] - - self.dof_pos[:, 6] / self.scales["dof_pos"][6] - ) - return self._sqrdexp(error) - - def _reward_hip_pitch_symmetry(self): - error = ( - self.dof_pos[:, 2] / self.scales["dof_pos"][2] - + self.dof_pos[:, 7] / self.scales["dof_pos"][7] - ) - return self._sqrdexp(error) - - def _reward_joint_regularization_arms(self): - reward = 0 - reward += self._reward_arm_yaw_symmetry() - reward += self._reward_arm_yaw_zero() - reward += self._reward_arm_abad_zero() - reward += self._reward_arm_abad_symmetry() - reward += self._reward_arm_pitch_symmetry() - reward += self._reward_arm_pitch_zero() - reward += self._reward_elbow_zero() - return reward / 6.0 - - def _reward_arm_pitch_symmetry(self): - error = ( - self.dof_pos[:, 10] / self.scales["dof_pos"][10] - + self.dof_pos[:, 14] / self.scales["dof_pos"][14] - ) - return self._sqrdexp(error) - - def _reward_arm_pitch_zero(self): - error = self.dof_pos[:, 10] - self.default_dof_pos[:, 10] - reward = self._sqrdexp(error / self.scales["dof_pos"][10]) - error = self.dof_pos[:, 14] - self.default_dof_pos[:, 14] 
- reward += self._sqrdexp(error / self.scales["dof_pos"][14]) - return reward / 2.0 - - def _reward_elbow_symmetry(self): - error = ( - self.dof_pos[:, 13] / self.scales["dof_pos"][13] - + self.dof_pos[:, 17] / self.scales["dof_pos"][17] - ) - return self._sqrdexp(error) - - def _reward_elbow_zero(self): - error = self.dof_pos[:, 13] - self.default_dof_pos[:, 13] - reward = self._sqrdexp(error / self.scales["dof_pos"][13]) - error = self.dof_pos[:, 17] - self.default_dof_pos[:, 17] - reward += self._sqrdexp(error / self.scales["dof_pos"][17]) - return reward / 2.0 - - def _reward_arm_yaw_symmetry(self): - error = ( - self.dof_pos[:, 12] / self.scales["dof_pos"][12] - - self.dof_pos[:, 16] / self.scales["dof_pos"][16] - ) - return self._sqrdexp(error) - - def _reward_arm_yaw_zero(self): - error = self.dof_pos[:, 12] - self.default_dof_pos[:, 12] - reward = self._sqrdexp(error / self.scales["dof_pos"][12]) - error = self.dof_pos[:, 16] - self.default_dof_pos[:, 16] - reward += self._sqrdexp(error / self.scales["dof_pos"][16]) - return reward / 2.0 - - def _reward_arm_abad_symmetry(self): - error = ( - self.dof_pos[:, 11] / self.scales["dof_pos"][11] - - self.dof_pos[:, 15] / self.scales["dof_pos"][15] - ) - return self._sqrdexp(error) - - def _reward_arm_abad_zero(self): - error = self.dof_pos[:, 11] - self.default_dof_pos[:, 11] - reward = self._sqrdexp(error / self.scales["dof_pos"][11]) - error = self.dof_pos[:, 15] - self.default_dof_pos[:, 15] - reward += self._sqrdexp(error / self.scales["dof_pos"][15]) - return reward / 2.0 diff --git a/gym/envs/mit_humanoid/humanoid_bouncing_config.py b/gym/envs/mit_humanoid/humanoid_bouncing_config.py deleted file mode 100644 index ee123ad0..00000000 --- a/gym/envs/mit_humanoid/humanoid_bouncing_config.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -Configuration file for "fixed arm" (FA) humanoid environment -with potential-based rewards implemented -""" - -import torch -from gym.envs.base.legged_robot_config import LeggedRobotCfg, LeggedRobotRunnerCfg - - -class HumanoidBouncingCfg(LeggedRobotCfg): - class env(LeggedRobotCfg.env): - num_envs = 4096 - num_actuators = 18 - episode_length_s = 5 # 100 - - class terrain(LeggedRobotCfg.terrain): - curriculum = False - mesh_type = "plane" - measure_heights = False - - class init_state(LeggedRobotCfg.init_state): - reset_mode = "reset_to_range" - pos = [0.0, 0.0, 0.75] # x,y,z [m] - rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] - lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] - ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] - - default_joint_angles = { - "hip_yaw": 0.0, - "hip_abad": 0.0, - "hip_pitch": -0.2, - "knee": 0.25, # 0.6 - "ankle": 0.0, - "shoulder_pitch": 0.0, - "shoulder_abad": 0.0, - "shoulder_yaw": 0.0, - "elbow": -1.25, - } - - # ranges for [x, y, z, roll, pitch, yaw] - root_pos_range = [ - [0.0, 0.0], # x - [0.0, 0.0], # y - [0.7, 0.72], # z - [-torch.pi / 10, torch.pi / 10], # roll - [-torch.pi / 10, torch.pi / 10], # pitch - [-torch.pi / 10, torch.pi / 10], # yaw - ] - - # ranges for [v_x, v_y, v_z, w_x, w_y, w_z] - root_vel_range = [ - [-0.5, 2.5], # x - [-0.5, 0.5], # y - [-0.5, 0.5], # z - [-0.5, 0.5], # roll - [-0.5, 0.5], # pitch - [-0.5, 0.5], # yaw - ] - - dof_pos_range = { - "hip_yaw": [-0.1, 0.1], - "hip_abad": [-0.2, 0.2], - "hip_pitch": [-0.2, 0.2], - "knee": [0.6, 0.7], - "ankle": [-0.3, 0.3], - "shoulder_pitch": [0.0, 0.0], - "shoulder_abad": [0.0, 0.0], - "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], - } - - dof_vel_range = { - "hip_yaw": [-0.1, 0.1], - "hip_abad": [-0.1, 0.1], - 
"hip_pitch": [-0.1, 0.1], - "knee": [-0.1, 0.1], - "ankle": [-0.1, 0.1], - "shoulder_pitch": [0.0, 0.0], - "shoulder_abad": [0.0, 0.0], - "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], - } - - class control(LeggedRobotCfg.control): - # stiffness and damping for joints - stiffness = { - "hip_yaw": 30.0, - "hip_abad": 30.0, - "hip_pitch": 30.0, - "knee": 30.0, - "ankle": 30.0, - "shoulder_pitch": 30.0, - "shoulder_abad": 30.0, - "shoulder_yaw": 30.0, - "elbow": 50.0, - } # [N*m/rad] - damping = { - "hip_yaw": 5.0, - "hip_abad": 5.0, - "hip_pitch": 5.0, - "knee": 5.0, - "ankle": 5.0, - "shoulder_pitch": 5.0, - "shoulder_abad": 5.0, - "shoulder_yaw": 5.0, - "elbow": 1.0, - } # [N*m*s/rad] - ctrl_frequency = 100 - desired_sim_frequency = 800 - - class commands(LeggedRobotCfg.commands): - resampling_time = 5.0 - - class ranges: - # TRAINING COMMAND RANGES # - lin_vel_x = [0, 4.5] # min max [m/s] - lin_vel_y = 0.75 # min max [m/s] - yaw_vel = 4.0 # min max [rad/s] - # # PLAY COMMAND RANGES # - # lin_vel_x = [3., 3.] # min max [m/s] - # lin_vel_y = 0. # min max [m/s] - # yaw_vel = 0. # min max [rad/s] - - class high_level: - sec_per_gait = 0.25 - interval = (int)(sec_per_gait * (5.0 / 2.0) * 100) # time steps - - class push_robots: - toggle = True - interval_s = 2.5 - max_push_vel_xy = 0.5 - push_box_dims = [0.1, 0.2, 0.3] # x,y,z [m] - - class domain_rand(LeggedRobotCfg.domain_rand): - randomize_friction = False - friction_range = [0.5, 1.25] - - randomize_base_mass = False - added_mass_range = [-1.0, 1.0] - - class asset(LeggedRobotCfg.asset): - # file = ('{LEGGED_GYM_ROOT_DIR}/resources/robots/rom/urdf/' - # +'humanoid_fixed_arms_full.urdf') - file = ( - "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_F_sf.urdf" - ) - keypoints = ["base"] - end_effectors = ["left_foot", "right_foot"] - # end_effector_names = ['left_toe', 'left_heel', - # 'right_toe', 'right_heel'] - foot_name = "foot" - terminate_after_contacts_on = [ - "base", - "left_upper_leg", - "left_lower_leg", - "right_upper_leg", - "right_lower_leg", - "left_upper_arm", - "right_upper_arm", - "left_lower_arm", - "right_lower_arm", - "left_hand", - "right_hand", - ] - - fix_base_link = False - disable_gravity = False - - disable_motors = False - - # (1: disable, 0: enable...bitwise filter) - self_collisions = 0 - collapse_fixed_joints = False - flip_visual_attachments = False - - # Check GymDofDriveModeFlags - # (0: none, 1: pos tgt, 2: vel target, 3: effort) - default_dof_drive_mode = 3 - - class reward_settings(LeggedRobotCfg.reward_settings): - soft_dof_pos_limit = 0.9 - soft_dof_vel_limit = 0.9 - soft_torque_limit = 0.8 - max_contact_force = 1500.0 - - # negative total rewards clipped at zero (avoids early termination) - only_positive_rewards = False # ! zap? 
- base_height_target = 0.6565 - tracking_sigma = 0.5 - - class scaling(LeggedRobotCfg.scaling): - hl_pos = 0.05 # 0.3 # maximum altitude for current bouncing ball param - hl_vel = 2.0 # 4.905 # max impulse imparted by HL - base_height = 0.6565 - base_lin_vel = 1.0 - base_ang_vel = torch.pi - dof_pos = 2 * [0.5, 1, 3, 2, 2] + 2 * [2, 1, 0.5, 2.0] - dof_vel = 1.0 - dof_pos_target = dof_pos - clip_actions = 1000.0 - - -class HumanoidBouncingRunnerCfg(LeggedRobotRunnerCfg): - do_wandb = True - seed = -1 - - class policy(LeggedRobotRunnerCfg.policy): - init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] - # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) - activation = "elu" - normalize_obs = True # True, False - - actor_obs = [ - "base_height", - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "hl_commands", - # "hl_impulses_flat", - # "commands", - # "phase_sin", - # "phase_cos", - # "time_since_hl_query", - "dof_pos_legs", - "dof_vel_legs", - "in_contact", - ] - - critic_obs = actor_obs - - actions = ["dof_pos_target_legs"] - disable_actions = False - - add_noise = True - noise_level = 1.0 # scales other values - - class noise: - base_height = 0.05 - base_lin_vel = 0.1 - base_ang_vel = 0.05 - projected_gravity = 0.05 - dof_pos = 0.005 - dof_vel = 0.01 - in_contact = 0.1 - - class reward: - class weights: - # * Behavioral rewards * # - action_rate = 1.0e-3 - action_rate2 = 1.0e-4 - # tracking_lin_vel = 5.0 - tracking_hl_pos = 5.0 - tracking_hl_vel = 5.0 - tracking_ang_vel = 5.0 - torques = 1e-4 - # dof_pos_limits = 10 - torque_limits = 1e-2 - dof_vel = 1e-4 - - # * Shaping rewards * # - base_height = 0 # 0.1 - orientation = 1.0 - hip_yaw_zero = 2.0 - hip_abad_symmetry = 0.2 - - class termination_weight: - termination = 1 - - class algorithm(LeggedRobotRunnerCfg.algorithm): - # algorithm training hyperparameters - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - num_mini_batches = 4 # minibatch size = num_envs*nsteps/nminibatches - learning_rate = 1.0e-5 - schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 - lam = 0.99 - desired_kl = 0.01 - max_grad_norm = 1.0 - - class runner(LeggedRobotRunnerCfg.runner): - policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 32 - max_iterations = 1500 - run_name = "" - experiment_name = "HumanoidTrajectoryTracking" - save_interval = 50 - plot_input_gradients = False - plot_parameter_gradients = False diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 37852232..8f072f79 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -165,7 +165,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False - disable_actions = False disable_motors = False # (1: disable, 0: enable...bitwise filter) @@ -201,15 +200,14 @@ class HumanoidRunningRunnerCfg(LeggedRobotRunnerCfg): do_wandb = True seed = -1 - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + hidden_dims = [256, 256, 256] # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) activation = "elu" normalize_obs = True # True, False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -221,10 +219,8 @@ class 
policy(LeggedRobotRunnerCfg.policy): "in_contact", ] - critic_obs = actor_obs - actions = ["dof_pos_target_legs"] - + disable_actions = False add_noise = True noise_level = 1.0 # scales other values @@ -237,6 +233,24 @@ class noise: dof_vel = 0.01 in_contact = 0.1 + class critic: + hidden_dims = [256, 256, 256] + # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) + activation = "elu" + normalize_obs = True # True, False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "phase_obs", + "dof_pos_legs", + "dof_vel_legs", + "in_contact", + ] + class reward: class weights: # * Behavioral rewards * # @@ -275,7 +289,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 max_iterations = 1000 run_name = "HumanoidRunning" diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..c7e061a9 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -181,14 +181,14 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -198,9 +198,10 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_history", ] - critic_obs = actor_obs + normalize_obs = True actions = ["dof_pos_target"] + disable_actions = False class noise: base_height = 0.05 @@ -211,6 +212,23 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + ] + normalize_obs = True + class reward: class weights: tracking_ang_vel = 0.5 @@ -248,7 +266,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 run_name = "Standing" diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py new file mode 100644 index 00000000..49d99696 --- /dev/null +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -0,0 +1,87 @@ +import torch +import scipy + +from gym.envs.pendulum.pendulum import Pendulum + + +class LQRPendulum(Pendulum): + def _init_buffers(self): + super()._init_buffers() + self.x_desired = torch.tensor([0.0, 0.0], device=self.device) + self.u_desired = torch.zeros(self.torques.shape, device=self.device) + self.A, self.B = self.linearize_pendulum_dynamics(self.x_desired) + self.Q = torch.eye(self.dof_state.shape[1], device=self.device) + self.Q[0, 0] = 10.0 + self.R = torch.eye(self.torques.shape[1], device=self.device) + self.S = ( + torch.from_numpy(self.solve_ricatti(self.Q, self.R)).float().to(self.device) + ) + + def _compute_torques(self): + """Compute torques from actions. 
+        Actions can be interpreted as position or velocity targets given
+        to a PD controller, or directly as scaled torques.
+        [NOTE]: torques must have the same dimension as the number of DOFs,
+        even if some DOFs are not actuated.
+
+        Args:
+            actions (torch.Tensor): Actions
+
+        Returns:
+            [torch.Tensor]: Torques sent to the simulation
+        """
+        actuated_dof_pos = torch.zeros(
+            self.num_envs, self.num_actuators, device=self.device
+        )
+        actuated_dof_vel = torch.zeros(
+            self.num_envs, self.num_actuators, device=self.device
+        )
+        for dof_idx in range(self.num_dof):
+            idx = 0
+            if self.cfg.control.actuated_joints_mask[dof_idx]:
+                actuated_dof_pos[:, idx] = self.dof_pos[:, dof_idx]
+                actuated_dof_vel[:, idx] = self.dof_vel[:, dof_idx]
+                idx += 1
+
+        torques = self.lqr_u_prime(
+            self.S, self.R, self.dof_state, self.x_desired, self.u_desired
+        )
+        return torques.view(self.torques.shape)
+
+    def solve_ricatti(self, Q, R):
+        A = self.A.cpu().detach().numpy()
+        B = self.B.cpu().detach().numpy()
+        Q = Q.cpu().detach().numpy()
+        R = R.cpu().detach().numpy()
+        S = scipy.linalg.solve_continuous_are(A, B, Q, R)
+        return S
+
+    def lqr_u_prime(self, S, R, x, x_desired, u_desired):
+        batch_envs = x.shape[0]
+        B_T = self.B.transpose(-1, -2).expand(batch_envs, -1, -1)
+        S = S.expand(batch_envs, -1, -1)
+        R_inv = torch.linalg.inv(R.expand(batch_envs, -1, -1))
+        x_bar = x - x_desired.expand(batch_envs, -1)
+        K = torch.einsum(
+            "...ij, ...jk -> ...ik",
+            torch.einsum("...ij, ...jk -> ...ik", R_inv, B_T),
+            S,
+        )
+        u_prime = u_desired - torch.einsum(
+            "...ij, ...jk -> ...ik", K, x_bar.unsqueeze(-1)
+        ).squeeze(-1)
+        return u_prime
+
+    def linearize_pendulum_dynamics(self, x_desired):
+        m = self.cfg.asset.mass
+        b = self.cfg.asset.joint_damping
+        length = self.cfg.asset.length
+        g = 9.81
+        ml2 = m * length**2
+
+        A = torch.tensor(
+            [[0.0, 1.0], [g / length * torch.cos(x_desired[0]), -b / ml2]],
+            device=self.device,
+        )
+        B = torch.tensor([[0.0], [(1.0 / ml2)]], device=self.device)
+        return A, B
diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py
new file mode 100644
index 00000000..1c0975f9
--- /dev/null
+++ b/gym/envs/pendulum/lqr_pendulum_config.py
@@ -0,0 +1,126 @@
+import torch
+
+from gym.envs.base.fixed_robot_config import FixedRobotCfg, FixedRobotCfgPPO
+
+
+class LQRPendulumCfg(FixedRobotCfg):
+    class env(FixedRobotCfg.env):
+        num_envs = 32  # 2**12
+        num_actuators = 1  # 1 for theta connecting base and pole
+        episode_length_s = 3.0
+
+    class terrain(FixedRobotCfg.terrain):
+        pass
+
+    class viewer:
+        ref_env = 0
+        pos = [10.0, 5.0, 10.0]  # [m]
+        lookat = [0.0, 0.0, 0.0]  # [m]
+
+    class init_state(FixedRobotCfg.init_state):
+        default_joint_angles = {"theta": torch.pi}  # -torch.pi / 2.0}
+
+        # * default setup chooses how the initial conditions are chosen.
+        # * "reset_to_basic" = a single position
+        # * "reset_to_range" = uniformly random from a range defined below
+        reset_mode = "reset_to_basic"
+
+        # * initial conditions for reset_to_range
+        dof_pos_range = {
+            "theta": [-torch.pi / 2.0, torch.pi / 2.0],
+        }
+        dof_vel_range = {"theta": [-5, 5]}
+
+    class control(FixedRobotCfg.control):
+        actuated_joints_mask = [1]  # angle
+        ctrl_frequency = 500
+        desired_sim_frequency = 500
+        stiffness = {"theta": 0.0}  # [N*m/rad]
+        damping = {"theta": 0.0}  # [N*m*s/rad]
+
+    class asset(FixedRobotCfg.asset):
+        # * Things that differ
+        file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf"
+        disable_gravity = False
+        disable_motors = False  # all torques set to 0
+        joint_damping = 0.1
+        mass = 1.0
+        length = 1.0
+
+    class reward_settings(FixedRobotCfg.reward_settings):
+        tracking_sigma = 0.25
+
+    class scaling(FixedRobotCfg.scaling):
+        dof_vel = 5.0
+        dof_pos = 2.0 * torch.pi
+        # * Action scales
+        tau_ff = 1.0
+
+
+class LQRPendulumRunnerCfg(FixedRobotCfgPPO):
+    seed = -1
+    runner_class_name = "LQRDataGenRunner"  # "LQRDataGenRunner"
+
+    class actor:
+        hidden_dims = [128, 64, 32]
+        # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
+        activation = "tanh"
+
+        obs = [
+            "dof_pos",
+            "dof_vel",
+        ]
+
+        actions = ["tau_ff"]
+        disable_actions = False
+
+        class noise:
+            dof_pos = 0.0
+            dof_vel = 0.0
+
+    class critic:
+        critic_class_name = ""
+        obs = [
+            "dof_pos",
+            "dof_vel",
+        ]
+        hidden_dims = [128, 64, 32]
+        # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
+        activation = "tanh"
+
+    class reward:
+        class weights:
+            theta = 0.0
+            omega = 0.0
+            equilibrium = 1.0
+            energy = 0.0
+            dof_vel = 0.0
+            torques = 0.025
+
+        class termination_weight:
+            termination = 0.0
+
+    class algorithm(FixedRobotCfgPPO.algorithm):
+        # training params
+        value_loss_coef = 1.0
+        use_clipped_value_loss = True
+        clip_param = 0.2
+        entropy_coef = 0.01
+        num_learning_epochs = 6
+        # * mini batch size = num_envs*nsteps / nminibatches
+        num_mini_batches = 4
+        learning_rate = 1.0e-3
+        schedule = "fixed"  # could be adaptive, fixed
+        discount_horizon = 2.0  # [s]
+        lam = 0.98
+        # GAE_bootstrap_horizon = .0 # [s]
+        desired_kl = 0.01
+        max_grad_norm = 1.0
+        plus_c_penalty = 0.1
+
+    class runner(FixedRobotCfgPPO.runner):
+        run_name = ""
+        experiment_name = "pendulum"
+        max_iterations = 500  # number of policy updates
+        algorithm_class_name = "PPO2"
+        num_steps_per_env = 32
diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py
index 7e81a51f..c63ffa6c 100644
--- a/gym/envs/pendulum/pendulum.py
+++ b/gym/envs/pendulum/pendulum.py
@@ -1,3 +1,4 @@
+from math import sqrt
 import torch
 
 from gym.envs.base.fixed_robot import FixedRobot
@@ -8,6 +9,28 @@ def _post_physics_step(self):
         """Update all states that are not handled in PhysX"""
         super()._post_physics_step()
 
+    def _check_terminations_and_timeouts(self):
+        super()._check_terminations_and_timeouts()
+        self.terminated = self.timed_out
+
+    def reset_to_uniform(self, env_ids):
+        grid_points = int(sqrt(self.num_envs))
+        lin_pos = torch.linspace(
+            self.dof_pos_range[0, 0],
+            self.dof_pos_range[0, 1],
+            grid_points,
+            device=self.device,
+        )
+        lin_vel = torch.linspace(
+            self.dof_vel_range[0, 0],
+            self.dof_vel_range[0, 1],
+            grid_points,
+            device=self.device,
+        )
+        grid = torch.cartesian_prod(lin_pos, lin_vel)
+        self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1)
+        self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1)
+
     def _reward_theta(self):
         theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"]
return self._sqrdexp(theta_rwd.squeeze(dim=-1)) @@ -25,15 +48,23 @@ def _reward_equilibrium(self): # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma # ) + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + def _reward_energy(self): - m_pendulum = 1.0 - l_pendulum = 1.0 kinetic_energy = ( - 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + 0.5 + * self.cfg.asset.mass + * self.cfg.asset.length**2 + * torch.square(self.dof_vel[:, 0]) ) potential_energy = ( - m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + self.cfg.asset.mass + * 9.81 + * self.cfg.asset.length + * torch.cos(self.dof_pos[:, 0]) ) - desired_energy = m_pendulum * 9.81 * l_pendulum + desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length energy_error = kinetic_energy + potential_energy - desired_energy return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 0fc99016..ba7f42c7 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**13 + num_envs = 2**12 num_actuators = 1 # 1 for theta connecting base and pole episode_length_s = 5.0 @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_uniform" # * initial conditions for reset_to_range dof_pos_range = { @@ -33,8 +33,8 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 - desired_sim_frequency = 200 + ctrl_frequency = 10 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -44,6 +44,8 @@ class asset(FixedRobotCfg.asset): disable_gravity = False disable_motors = False # all torques set to 0 joint_damping = 0.1 + mass = 1.0 + length = 1.0 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -59,27 +61,34 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "DataLoggingRunner" - class policy(FixedRobotCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] - critic_hidden_dims = [128, 64, 32] + class actor: + hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - - actor_obs = [ + normalize_obs = True + obs = [ "dof_pos", "dof_vel", ] - critic_obs = actor_obs - actions = ["tau_ff"] disable_actions = False - standard_critic_nn = True class noise: dof_pos = 0.0 dof_vel = 0.0 + class critic: + critic_class_name = "CholeskyOffset1" + normalize_obs = True + obs = [ + "dof_pos", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + class reward: class weights: theta = 0.0 @@ -93,28 +102,28 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + discount_horizon = 2.0 + lam = 0.98 + # shared + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**16 # new clip_param = 0.2 + learning_rate = 1.0e-4 + max_grad_norm = 
1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-3 schedule = "fixed" # could be adaptive, fixed - discount_horizon = 2.0 # [s] - lam = 0.98 - # GAE_bootstrap_horizon = .0 # [s] desired_kl = 0.01 - max_grad_norm = 1.0 - standard_loss = True - plus_c_penalty = 0.1 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" - num_steps_per_env = 32 - + max_iterations = 200 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 50 diff --git a/gym/scripts/sweep_configs/sweep_pendulum.json b/gym/scripts/sweep_configs/sweep_pendulum.json index 050d2457..b8259a0e 100644 --- a/gym/scripts/sweep_configs/sweep_pendulum.json +++ b/gym/scripts/sweep_configs/sweep_pendulum.json @@ -6,16 +6,16 @@ }, "run_cap": 50, "parameters": { - "train_cfg.policy.reward.weights.theta":{ + "train_cfg.critic.reward.weights.theta":{ "values": [1.0, 10.0] }, - "train_cfg.policy.reward.weights.omega":{ + "train_cfg.critic.reward.weights.omega":{ "values": [1.0, 10.0] }, - "train_cfg.policy.reward.weights.equilibrium":{ + "train_cfg.critic.reward.weights.equilibrium":{ "values": [1.0, 10.0] }, - "train_cfg.policy.reward.weights.energy":{ + "train_cfg.critic.reward.weights.energy":{ "values": [1.0, 10.0] } } diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index a7363148..7a577b1b 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -181,28 +181,31 @@ def update_cfg_from_args(env_cfg, train_cfg, args): train_cfg.runner.checkpoint = args.checkpoint if args.rl_device is not None: train_cfg.runner.device = args.rl_device + if args.custom_critic: + train_cfg.critic.hidden_dims = None + else: + train_cfg.critic.critic_class_name = "Critic" if args.train_critic_only: - train_cfg.runner.algorithm_class_name = "PPOCriticOnly" + train_cfg.runner.algorithm_class_name = "PPO2CriticOnly" + train_cfg.runner_class_name = "CriticOnlyRunner" + train_cfg.runner.resume = True train_cfg.runner.experiment_name += "_critic_only" - if args.task == "pendulum" and args.custom_critic: - train_cfg.policy.standard_critic_nn = False - train_cfg.algorithm.standard_loss = False - train_cfg.runner.run_name += "custom_critic" - elif args.task == "pendulum" and not args.custom_critic: - train_cfg.policy.standard_critic_nn = True - train_cfg.algorithm.standard_loss = True - train_cfg.runner.run_name += "standard_critic" + train_cfg.runner.run_name += ( + f"{train_cfg.critic.critic_class_name}_custom_critic_only" + if args.task == "pendulum" and args.custom_critic + else "standard_critic_only" + ) else: - if args.task == "pendulum" and args.custom_critic: - train_cfg.policy.standard_critic_nn = False - train_cfg.algorithm.standard_loss = False - train_cfg.runner.experiment_name += "_custom_critic" - train_cfg.runner.run_name += "custom_critic" - elif args.task == "pendulum" and not args.custom_critic: - train_cfg.policy.standard_critic_nn = True - train_cfg.algorithm.standard_loss = True - train_cfg.runner.experiment_name += "_standard_critic" - train_cfg.runner.run_name += "standard_critic" + train_cfg.runner.experiment_name += ( + "_custom_critic" + if args.task == "pendulum" and args.custom_critic + else "_standard_critic" + ) + train_cfg.runner.run_name += ( + f"{train_cfg.critic.critic_class_name}_custom_critic" + if args.task == "pendulum" and 
args.custom_critic + else "standard_critic" + ) def get_args(custom_parameters=None): diff --git a/gym/utils/logging_and_saving/wandb_singleton.py b/gym/utils/logging_and_saving/wandb_singleton.py index a1b45b8a..bba14fea 100644 --- a/gym/utils/logging_and_saving/wandb_singleton.py +++ b/gym/utils/logging_and_saving/wandb_singleton.py @@ -7,7 +7,7 @@ class WandbSingleton(object): def __new__(self): if not hasattr(self, "instance"): - self.instance = super(WandbSingleton, self).__new__(self) + self.instance = super().__new__(self) self.entity_name = None self.project_name = None self.experiment_name = "" diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index 37186415..aef59018 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -31,5 +31,7 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .ppo import PPO +from .ppo2 import PPO2 from .SE import StateEstimator -from .ppo_critic_only import PPOCriticOnly +# from .ppo_critic_only_old import PPOCriticOnly +from .ppo2_critic_only import PPO2CriticOnly diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 86470a31..92898ad5 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -36,7 +36,6 @@ from learning.modules import ActorCritic from learning.storage import RolloutStorage -from learning.modules.lqrc import CustomCholeskyPlusConstLoss class PPO: @@ -58,8 +57,6 @@ def __init__( schedule="fixed", desired_kl=0.01, device="cpu", - standard_loss=True, - plus_c_penalty=0.0, **kwargs, ): self.device = device @@ -72,7 +69,12 @@ def __init__( self.actor_critic = actor_critic self.actor_critic.to(self.device) self.storage = None # initialized later - self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate) + self.optimizer = optim.Adam( + self.actor_critic.actor.parameters(), lr=learning_rate + ) + self.crit_optimizer = optim.Adam( + self.actor_critic.critic.parameters(), lr=learning_rate + ) self.transition = RolloutStorage.Transition() # * PPO parameters @@ -86,10 +88,6 @@ def __init__( self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - # * custom NN parameters - self.standard_loss = standard_loss - self.plus_c_penalty = plus_c_penalty - def init_storage( self, num_envs, @@ -128,6 +126,11 @@ def act(self, obs, critic_obs): return self.transition.actions def process_env_step(self, rewards, dones, timed_out=None): + if self.storage is None: + raise AttributeError( + "This version of PPO is deprecated and only works with OldPolicyRunner. " + "Use PPO2 instead."
+ ) self.transition.rewards = rewards.clone() self.transition.dones = dones # * Bootstrapping on time outs @@ -163,7 +166,6 @@ def update(self): actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) - value_batch = self.actor_critic.evaluate(critic_obs_batch) mu_batch = self.actor_critic.action_mean sigma_batch = self.actor_critic.action_std entropy_batch = self.actor_critic.entropy @@ -202,53 +204,38 @@ def update(self): surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() # * Value function loss + value_batch = self.actor_critic.evaluate(critic_obs_batch) if self.use_clipped_value_loss: value_clipped = target_values_batch + ( value_batch - target_values_batch ).clamp(-self.clip_param, self.clip_param) - if self.standard_loss: - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - else: - value_losses = CustomCholeskyPlusConstLoss( - const_penalty=self.plus_c_penalty - ).forward( - value_batch, - returns_batch, - self.actor_critic.critic.NN.intermediates, - ) - value_losses_clipped = CustomCholeskyPlusConstLoss( - const_penalty=self.plus_c_penalty - ).forward( - value_clipped, - returns_batch, - self.actor_critic.critic.NN.intermediates, - ) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) value_loss = torch.max(value_losses, value_losses_clipped).mean() else: - if self.standard_loss: - value_loss = (returns_batch - value_batch).pow(2).mean() - else: - value_losses = CustomCholeskyPlusConstLoss( - const_penalty=self.plus_c_penalty - ).forward( - value_batch, - returns_batch, - self.actor_critic.critic.NN.intermediates, - ) + value_loss = (returns_batch - value_batch).pow(2).mean() loss = ( surrogate_loss - + self.value_loss_coef * value_loss + # + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean() ) # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm) + nn.utils.clip_grad_norm_( + self.actor_critic.actor.parameters(), self.max_grad_norm + ) self.optimizer.step() + self.crit_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_( + self.actor_critic.critic.parameters(), self.max_grad_norm + ) + self.crit_optimizer.step() + self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py new file mode 100644 index 00000000..f03d0e29 --- /dev/null +++ b/learning/algorithms/ppo2.py @@ -0,0 +1,160 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, + normalize, +) + + +class PPO2: + def __init__( + self, + actor, + critic, + batch_size=2**15, + max_gradient_steps=10, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) + self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + + # * PPO parameters + self.clip_param = 
clip_param + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def test_mode(self): + self.actor.test() + self.critic.test() + + def switch_to_train(self): + self.actor.train() + self.critic.train() + + def act(self, obs): + return self.actor.act(obs).detach() + + def update(self, data): + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic + ) + data["returns"] = data["advantages"] + data["values"] + self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) + self.update_actor(data) + + def update_critic(self, data): + self.mean_value_loss = 0 + counter = 0 + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data): + self.mean_surrogate_loss = 0 + counter = 0 + + self.actor.act(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + self.actor.act(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + + ( + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + for param_group in self.optimizer.param_groups: + # ! 
check this + param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) + ) + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += surrogate_loss.item() + counter += 1 + self.mean_surrogate_loss /= counter diff --git a/learning/algorithms/ppo2_critic_only.py b/learning/algorithms/ppo2_critic_only.py new file mode 100644 index 00000000..29068d08 --- /dev/null +++ b/learning/algorithms/ppo2_critic_only.py @@ -0,0 +1,169 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from learning.algorithms.ppo2 import PPO2 + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, +) + + +class PPO2CriticOnly(PPO2): + def __init__( + self, + actor, + critic, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + loss_fn="MSE", + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + self.actor = actor.to(self.device) + # self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) + self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def test_mode(self): + self.actor.test() + self.critic.test() + + def switch_to_train(self): + self.actor.train() + self.critic.train() + + def act(self, obs, critic_obs): + return self.actor.act(obs).detach() + + def update(self, data, last_obs=None): + if last_obs is None: + last_values = None + else: + with torch.no_grad(): + last_values = self.critic.evaluate(last_obs).detach() + compute_generalized_advantages( + data, self.gamma, self.lam, self.critic, last_values + ) + + self.update_critic(data) + self.update_actor(data) #! 
gradient step is disabled + + def update_critic(self, data): + self.mean_value_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + value_batch = self.critic.evaluate(batch["critic_obs"]) + value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data): + # already done before + # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) + self.mean_surrogate_loss = 0 + counter = 0 + + self.actor.act(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + # ! refactor how this is done + self.actor.act(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + # entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + + ( + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + # for param_group in self.optimizer.param_groups: + # # ! 
check this + # param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) + ) + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + # loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + + # * Gradient step + # self.optimizer.zero_grad() + # loss.backward() + # nn.utils.clip_grad_norm_( + # list(self.actor.parameters()) + list(self.critic.parameters()), + # self.max_grad_norm, + # ) + # self.optimizer.step() + self.mean_surrogate_loss += surrogate_loss.item() + counter += 1 + self.mean_surrogate_loss /= counter diff --git a/learning/algorithms/ppo_critic_only.py b/learning/algorithms/ppo_critic_only_old.py similarity index 100% rename from learning/algorithms/ppo_critic_only.py rename to learning/algorithms/ppo_critic_only_old.py diff --git a/learning/modules/actor.py b/learning/modules/actor.py index f9c82396..fbaa6868 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -17,11 +17,6 @@ def __init__( normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Actor.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self._normalize_obs = normalize_obs @@ -51,9 +46,7 @@ def entropy(self): return self.distribution.entropy().sum(dim=-1) def update_distribution(self, observations): - if self._normalize_obs: - observations = self.normalize(observations) - mean = self.NN(observations) + mean = self.act_inference(observations) self.distribution = Normal(mean, mean * 0.0 + self.std) def act(self, observations): @@ -65,16 +58,12 @@ def get_actions_log_prob(self, actions): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) - actions_mean = self.NN(observations) - return actions_mean + with torch.no_grad(): + observations = self.obs_rms(observations) + return self.NN(observations) def forward(self, observations): return self.act_inference(observations) - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) - def export(self, path): export_network(self, "policy", path, self.num_obs) diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 157ba38b..cec8cbb5 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,49 +1,17 @@ import torch.nn as nn -from .actor import Actor -from .critic import Critic - class ActorCritic(nn.Module): - def __init__( - self, - num_actor_obs, - num_critic_obs, - num_actions, - actor_hidden_dims=[256, 256, 256], - critic_hidden_dims=[256, 256, 256], - activation="elu", - init_noise_std=1.0, - normalize_obs=True, - standard_critic_nn=True, - **kwargs, - ): + def __init__(self, actor, critic, **kwargs): if kwargs: print( "ActorCritic.__init__ got unexpected arguments, " "which will be ignored: " + str([key for key in kwargs.keys()]) ) - super(ActorCritic, self).__init__() - - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) - - self.critic = Critic( - num_critic_obs, - critic_hidden_dims, - activation, - normalize_obs, - standard_critic_nn, - ) + super().__init__() - print(f"Actor MLP: {self.actor.NN}") - print(f"Critic MLP: 
{self.critic.NN}") + self.actor = actor + self.critic = critic @property def action_mean(self): diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 608fba03..35c34843 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -3,7 +3,6 @@ from .utils import create_MLP from .utils import RunningMeanStd -from .lqrc import CholeskyPlusConst class Critic(nn.Module): @@ -13,30 +12,20 @@ def __init__( hidden_dims=None, activation="elu", normalize_obs=True, - standard_nn=True, **kwargs, ): - if kwargs: - print( - "Critic.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() - self.NN = ( - create_MLP(num_obs, 1, hidden_dims, activation) - if standard_nn - else CholeskyPlusConst(num_obs) - ) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) def evaluate(self, critic_observations): if self._normalize_obs: - critic_observations = self.normalize(critic_observations) + with torch.no_grad(): + critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) + def loss_fn(self, obs, target): + return nn.functional.mse_loss(self.evaluate(obs), target, reduction="mean") diff --git a/learning/modules/lqrc/__init__.py b/learning/modules/lqrc/__init__.py index f3b1e645..56cea2c0 100644 --- a/learning/modules/lqrc/__init__.py +++ b/learning/modules/lqrc/__init__.py @@ -1,2 +1,4 @@ -from .custom_nn import CholeskyPlusConst, QuadraticNetCholesky, BaselineMLP, CustomCholeskyLoss, CustomCholeskyPlusConstLoss +from .custom_nn import CholeskyPlusConst, QuadraticNetCholesky, BaselineMLP, CustomCholeskyLoss, CustomCholeskyPlusConstLoss # deprecated but left here until there's time for clean up from .utils import benchmark_args +from .custom_critics import CustomCriticBaseline, Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 +from .autoencoder import Autoencoder \ No newline at end of file diff --git a/learning/modules/lqrc/autoencoder.py b/learning/modules/lqrc/autoencoder.py new file mode 100644 index 00000000..c4e24838 --- /dev/null +++ b/learning/modules/lqrc/autoencoder.py @@ -0,0 +1,15 @@ +import torch.nn as nn + +from learning.modules.utils.neural_net import create_MLP + + +class Autoencoder(nn.Module): + def __init__(self, num_inputs, num_latent, hidden_dims, activation="elu"): + super().__init__() + self.encoder = create_MLP(num_inputs, num_latent, hidden_dims, activation) + self.decoder = create_MLP(num_latent, num_inputs, hidden_dims, activation) + + def forward(self, x): + x = self.encoder(x) + x = self.decoder(x) + return x diff --git a/learning/modules/lqrc/custom_critics.py b/learning/modules/lqrc/custom_critics.py new file mode 100644 index 00000000..764980d6 --- /dev/null +++ b/learning/modules/lqrc/custom_critics.py @@ -0,0 +1,264 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from learning.modules.critic import Critic +from learning.modules.utils import RunningMeanStd +from learning.modules.utils.neural_net import get_activation + + +class CustomCriticBaseline(Critic): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + super(Critic, self).__init__() + hidden_dims = [32, 128] if hidden_dims is None else hidden_dims 
+ assert len(hidden_dims) == 2, "Too many hidden dims passed to Custom Critic" + activation = get_activation(activation) + self.input_size = num_obs + self.output_size = output_size + self.device = device + self.intermediates = {} + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.connection_1 = nn.Linear(self.input_size, hidden_dims[0]) + self.activation_1 = activation + self.connection_2 = nn.Linear(hidden_dims[0], hidden_dims[1]) + self.activation_2 = activation + self.connection_3 = nn.Linear(hidden_dims[1], output_size) + self.activation_3 = nn.Softplus() + + self.NN = self.forward + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + return output + + +class Cholesky(CustomCriticBaseline): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims=hidden_dims, + activation=activation, + normalize_obs=normalize_obs, + output_size=sum(range(num_obs + 1)), + device=device, + ) + self.activation_3.register_forward_hook(self.save_intermediate()) + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), + x.unsqueeze(-1), + ).squeeze(-1) + return y_pred + + def create_cholesky(self, x): + n = self.input_size + L = torch.zeros((*x.shape[:-1], n, n), device=self.device, requires_grad=False) + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + rows, cols = tril_indices + L[..., rows, cols] = x + return L + + def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + self.intermediates["A"] = A + + return hook + + +class CholeskyPlusConst(Cholesky): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + # additional 1 to output_size for +c + super(Cholesky, self).__init__( + num_obs=num_obs, + hidden_dims=hidden_dims, + activation=activation, + normalize_obs=normalize_obs, + output_size=sum(range(num_obs + 1)) + 1, + device=device, + ) + self.activation_3.register_forward_hook(self.save_intermediate()) + # 0.0 if kwargs.get("const_penalty") is None else kwargs.get("const_penalty") + self.const_penalty = 0.1 + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output[..., :-1]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + c = output[..., -1] + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), + x.unsqueeze(-1), + ) + c.unsqueeze(-1).unsqueeze(-1) + return y_pred.squeeze(-1) + 
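+ # forward hook below caches A = C C^T and the scalar c, so loss_fn can add its const_penalty * mean(c) term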
+ def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output[..., :-1]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + c = output[..., -1] + self.intermediates["A"] = A + self.intermediates["c"] = c + + return hook + + def loss_fn(self, input, target): + loss = F.mse_loss(self.evaluate(input), target, reduction="mean") + const_loss = self.const_penalty * torch.mean(self.intermediates["c"]) + return loss + const_loss + + +class CholeskyOffset1(Cholesky): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + # + input_size for offset + super(Cholesky, self).__init__( + num_obs=num_obs, + hidden_dims=hidden_dims, + activation=activation, + normalize_obs=normalize_obs, + output_size=sum(range(num_obs + 1)) + num_obs, + device=device, + ) + self.L_indices = sum(range(num_obs + 1)) + self.activation_3.register_forward_hook(self.save_intermediate()) + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output[..., : self.L_indices]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + offset = output[..., self.L_indices :] + x_bar = x - offset + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum( + "...ij, ...jk -> ...ik", x_bar.unsqueeze(-1).transpose(-2, -1), A + ), + x_bar.unsqueeze(-1), + ).squeeze(-1) + return y_pred + + def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output[..., : self.L_indices]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + offset = output[..., self.L_indices :] + self.intermediates["A"] = A + self.intermediates["offset"] = offset + + return hook + + +class CholeskyOffset2(Cholesky): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + super().__init__(num_obs) + self.xhat_layer = nn.Linear(num_obs, num_obs) + self.xhat_layer.register_forward_hook(self.save_xhat()) + + def forward(self, x): + xhat = self.xhat_layer(x) + output = self.connection_1(xhat) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum( + "...ij, ...jk -> ...ik", xhat.unsqueeze(-1).transpose(-2, -1), A + ), + xhat.unsqueeze(-1), + ).squeeze(-1) + return y_pred + + def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + self.intermediates["A"] = A + + return hook + + def save_xhat(self): + def hook(module, input, output): + self.intermediates["xhat"] = output + + return hook diff --git a/learning/modules/lqrc/custom_nn.py b/learning/modules/lqrc/custom_nn.py index b47357e0..0d56c658 100644 --- a/learning/modules/lqrc/custom_nn.py +++ b/learning/modules/lqrc/custom_nn.py @@ -151,6 +151,93 @@ def forward(self, y_pred, y, intermediates=None): return loss + const_loss +class CholeskyOffset1(QuadraticNetCholesky): + def __init__(self, 
input_size, hidden_dims=128, device="cuda"): + # + input_size for offset + super(QuadraticNetCholesky, self).__init__( + input_size, + sum(range(input_size + 1)) + input_size, + hidden_dims=hidden_dims, + device=device, + ) + self.L_indices = sum(range(input_size + 1)) + self.activation_3.register_forward_hook(self.save_intermediate()) + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output[:, : self.L_indices]) + A = C.bmm(C.transpose(1, 2)) + offset = output[:, self.L_indices :] + x_bar = x - offset + y_pred = ( + x_bar.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x_bar.unsqueeze(2)) + ).squeeze(2) + return y_pred + + def save_intermediate(self): + """ + Forward hook to save A and offset + """ + + def hook(module, input, output): + C = self.create_cholesky(output[:, : self.L_indices]) + A = C.bmm(C.transpose(1, 2)) + offset = output[:, self.L_indices :] + self.intermediates["A"] = A + self.intermediates["offset"] = offset + + return hook + + +class CholeskyOffset2(QuadraticNetCholesky): + def __init__(self, input_size, hidden_dims=128, device="cuda"): + super().__init__(input_size) + self.xhat_layer = nn.Linear(input_size, input_size) + self.xhat_layer.register_forward_hook(self.save_xhat()) + + def forward(self, x): + xhat = self.xhat_layer(x) + output = self.connection_1(xhat) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + y_pred = ( + xhat.unsqueeze(2).transpose(1, 2).bmm(A).bmm(xhat.unsqueeze(2)) + ).squeeze(2) + return y_pred + + def save_intermediate(self): + """ + Forward hook to save A + """ + + def hook(module, input, output): + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + self.intermediates["A"] = A + + return hook + + def save_xhat(self): + """ + Forward hook to save xhat for debugging + """ + + def hook(module, input, output): + self.intermediates["xhat"] = output + + return hook + + class LQRCDataset(Dataset): def __init__(self, X, y): """ diff --git a/learning/modules/lqrc/evaluate_critic.py b/learning/modules/lqrc/evaluate_critic.py index 9b61d963..645c4c20 100644 --- a/learning/modules/lqrc/evaluate_critic.py +++ b/learning/modules/lqrc/evaluate_critic.py @@ -1,14 +1,17 @@ import os -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from utils import critic_eval_args, get_load_path from plotting import ( - plot_custom_critic, plot_critic_prediction_only, ) from learning.modules import Critic - +from learning.modules.lqrc.custom_critics import ( + CholeskyPlusConst, + CholeskyOffset1, + CholeskyOffset2, +) import torch @@ -32,10 +35,14 @@ def filter_state_dict(state_dict): def model_switch(args): - if args["model_type"] == "CholeskyPlusConst": - return Critic(2, standard_nn=False).to(DEVICE) - elif args["model_type"] == "StandardMLP": - return Critic(2, [512, 256, 128], "elu", standard_nn=True).to(DEVICE) + if args["model_type"] == "StandardMLP": + return Critic(2, [128, 64, 32], "tanh").to(DEVICE) + elif args["model_type"] == "CholeskyPlusConst": + return CholeskyPlusConst(2).to(DEVICE) + elif args["model_type"] == "CholeskyOffset1": + return CholeskyOffset1(2).to(DEVICE) + elif args["model_type"] == 
"CholeskyOffset2": + return CholeskyOffset2(2).to(DEVICE) else: raise KeyError("Specified model type is not supported for critic evaluation.") @@ -46,55 +53,26 @@ def model_switch(args): path = get_load_path(args["experiment_name"], args["load_run"], args["checkpoint"]) model = model_switch(args) - loaded_dict = torch.load(path) - critic_state_dict = filter_state_dict(loaded_dict["model_state_dict"]) + critic_state_dict = torch.load(path)["critic_state_dict"] model.load_state_dict(critic_state_dict) - lb = [0.0, -100.0] - ub = [2.0 * torch.pi, 100.0] + lb = [-torch.pi, -8.0] + ub = [torch.pi, 8.0] x = generate_data(lb, ub) - x_norm = model.normalize(x) + # x_norm = model.normalize(x) y_pred = [] - A_pred = [] - c_pred = [] model.eval() - t_c = 0 - p_c = 0 for X_batch in x: y_hat = model.evaluate(X_batch.unsqueeze(0)) y_pred.append(y_hat) - if model_type == "CholeskyPlusConst": - A_pred.append(model.NN.intermediates["A"]) - c_pred.append(model.NN.intermediates["c"]) - save_path = os.path.join(LEGGED_GYM_LQRC_DIR, "logs") + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "lqrc") if not os.path.exists(save_path): os.makedirs(save_path) fn = args["fn"] - if model_type == "CholeskyPlusConst": - plot_custom_critic( - x, - torch.vstack(y_pred), - ( - x.unsqueeze(2) - .transpose(1, 2) - .bmm(torch.vstack(A_pred)) - .bmm(x.unsqueeze(2)) - ).squeeze(2), - torch.vstack(c_pred), - save_path + f"/{fn}.png", - contour=args["contour"], - ) - plot_critic_prediction_only( - x, - torch.vstack(y_pred), - save_path + f"/{fn}_prediction_only.png", - contour=args["contour"], - ) - else: - plot_critic_prediction_only( - x, torch.vstack(y_pred), save_path + f"/{fn}.png", contour=args["contour"] - ) + plot_critic_prediction_only( + x, torch.vstack(y_pred), save_path + f"/{fn}.png", contour=args["contour"] + ) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index aa862b3c..9dde12df 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -1,10 +1,93 @@ from math import sqrt +import matplotlib import matplotlib.pyplot as plt import numpy as np from matplotlib.colors import CenteredNorm, TwoSlopeNorm +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["ps.fonttype"] = 42 +matplotlib.rcParams["mathtext.fontset"] = "stix" +matplotlib.rcParams["font.family"] = "STIXGeneral" -def graph_3D_helper(ax, contour): +font = {"size": 12} +matplotlib.rc("font", **font) + + +def plot_pendulum_multiple_critics( + x, predictions, targets, title, fn, colorbar_label="f(x)" +): + num_critics = len(x.keys()) + assert ( + num_critics >= 1 + ), "This function requires at least two critics for graphing. To graph a single critic please use its corresponding graphing function." 
+ fig, axes = plt.subplots(nrows=2, ncols=num_critics, figsize=(6 * num_critics, 10)) + for ix, critic_name in enumerate(x): + np_x = x[critic_name].detach().cpu().numpy().reshape(-1, 2) + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = targets[critic_name].detach().cpu().numpy().reshape(-1) + np_error = np_predictions - np_targets + + # predictions + if np.any(~np.equal(np_predictions, np.zeros_like(np_predictions))): + pred_scatter = axes[0, ix].scatter( + np_x[:, 0], + np_x[:, 1], + c=np_predictions, + cmap="PiYG", + alpha=0.5, + norm=CenteredNorm(), + ) + axes[0, ix].set_title(f"{critic_name} Prediction") + + # error + if np.any(~np.equal(np_error, np.zeros_like(np_error))): + error_scatter = axes[1, ix].scatter( + np_x[:, 0], + np_x[:, 1], + c=np_error, + cmap="bwr", + alpha=0.5, + norm=CenteredNorm(), + ) + axes[1, ix].set_title(f"{critic_name} Error") + + fig.colorbar( + pred_scatter, ax=axes[0, :].ravel().tolist(), shrink=0.95, label=colorbar_label + ) + fig.colorbar( + error_scatter, ax=axes[1, :].ravel().tolist(), shrink=0.95, label=colorbar_label + ) + fig.suptitle(title, fontsize=16) + plt.savefig(f"{fn}.png") + print(f"Saved to {fn}.png") + + +def plot_pendulum_single_critic( + x, predictions, targets, title, fn, colorbar_label="f(x)" +): + x = x.detach().cpu().numpy().reshape(-1, 2) + predictions = predictions.detach().cpu().numpy().reshape(-1) + targets = targets.detach().cpu().numpy().reshape(-1) + error = predictions - targets + + fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6)) + # error + scatter = axes[0].scatter( + x[:, 0], x[:, 1], c=error, cmap="bwr", alpha=0.5, norm=CenteredNorm() + ) + # prediction values + scatter = axes[1].scatter( + x[:, 0], x[:, 1], c=predictions, cmap="bwr", alpha=0.5, norm=CenteredNorm() + ) + axes[0].set_title("Error") + axes[1].set_title("Predictions") + fig.colorbar(scatter, ax=axes.ravel().tolist(), shrink=0.95, label=colorbar_label) + fig.suptitle(title, fontsize=16) + plt.savefig(f"{fn}.png") + print(f"Saved to {fn}.png") + + +def graph_3D_helper(ax, contour=False): if contour: return ax.contourf return ax.pcolormesh @@ -198,13 +281,17 @@ def plot_predictions_and_gradients( y_pred = y_pred.detach().cpu().numpy() y_actual = y_actual.detach().cpu().numpy() if actual_grad is not None: - actual_grad = actual_grad.detach().cpu().numpy() + actual_grad = ( + actual_grad.detach().cpu().numpy() + if not isinstance(actual_grad, np.ndarray) + else actual_grad + ) if colormap_diff: fig, (ax1, ax2) = plt.subplots( nrows=1, ncols=2, figsize=(16, 8), layout="tight" ) sq_len = int(sqrt(x_actual.shape[0])) - img1 = ax1.contourf( + img1 = graph_3D_helper(ax1)( x_actual[:, 0].reshape(sq_len, sq_len), x_actual[:, 1].reshape(sq_len, sq_len), np.linalg.norm( @@ -214,16 +301,18 @@ def plot_predictions_and_gradients( - actual_grad, axis=1, ).reshape(sq_len, sq_len), - cmap="RdBu_r", - norm=TwoSlopeNorm(0), + cmap="OrRd", + vmin=0.0, + vmax=30.0, ) plt.colorbar(img1, ax=ax1) - img2 = ax2.contourf( + img2 = graph_3D_helper(ax2)( x_actual[:, 0].reshape(sq_len, sq_len), x_actual[:, 1].reshape(sq_len, sq_len), (y_pred - y_actual).reshape(sq_len, sq_len), - cmap="PuOr", - norm=TwoSlopeNorm(0), + cmap="bwr", + vmin=-2.0, + vmax=2.0, ) plt.colorbar(img2, ax=ax2) set_titles_labels( @@ -243,7 +332,7 @@ def plot_predictions_and_gradients( c=y_pred, cmap="viridis", marker="o", - edgecolors="k", + alpha=0.7, ) scatter = axes[1].scatter( x_actual[:, 0], @@ -251,7 +340,7 @@ def plot_predictions_and_gradients( c=y_actual, 
cmap="viridis", marker="^", - edgecolors="k", + alpha=0.7, ) fig.colorbar(scatter, ax=axes.ravel().tolist(), shrink=0.95, label="f(x)") axes[0].set_title("Pointwise Predictions") @@ -278,6 +367,20 @@ def plot_predictions_and_gradients( for a in [45, 135, 180, 225]: ax.view_init(elev=10.0, azim=a) plt.savefig(f"{fn}_angle_{a}.png") + if colormap_diff: + fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(6, 6)) + scatter = axis.scatter( + x_actual[:, 0], + x_actual[:, 1], + c=y_pred - y_actual, + cmap="bwr", + marker="o", + alpha=0.5, + norm=CenteredNorm(), + ) + fig.colorbar(scatter, ax=axis, shrink=0.95, label="f(x)") + axis.set_title("Error Between Pointwise Predictions and Targets") + plt.savefig(f"{fn}_value_error_colormap.png") elif dim == 2: x_actual = x_actual.detach().cpu().numpy() y_pred = y_pred.detach().cpu().numpy() @@ -307,55 +410,34 @@ def plot_predictions_and_gradients( print("Dimension less than 2, cannot graph") -def plot_pointwise_predictions(dim, x_actual, y_pred, y_actual, fn): - if dim > 3: - print("Dimension greater than 3, cannot graph") - elif dim == 3: - x_actual = x_actual.detach().cpu().numpy() - y_pred = y_pred.detach().cpu().numpy() - y_actual = y_actual.detach().cpu().numpy() +def plot_autoencoder(targets, predictions, fn): + targets = targets.detach().cpu().numpy() + predictions = predictions.detach().cpu().numpy() - fig = plt.figure(constrained_layout=True) - ax = fig.add_subplot(projection="3d") - ax.scatter( - x_actual[:, 0], x_actual[:, 1], y_pred, marker="o", label="Predicted" - ) - ax.scatter(x_actual[:, 0], x_actual[:, 1], y_actual, marker="^", label="Actual") - ax.set_xlim(-15, 15) - ax.set_ylim(-15, 15) - ax.set_zlim(-15, 250) - ax.set_xlabel("x") - ax.set_ylabel("y") - ax.set_zlabel("f(x, y)") - ax.legend(loc="upper left") - ax.set_title("Pointwise Predictions vs Actual") - plt.legend(loc="upper left") - plt.savefig(f"{fn}.png") - elif dim == 2: - x_actual = x_actual.detach().cpu().numpy() - y_pred = y_pred.detach().cpu().numpy() - y_actual = y_actual.detach().cpu().numpy() - - _, ax = plt.subplots() - ax.scatter(x_actual, y_pred, label="Predicted") - ax.scatter(x_actual, y_actual, label="Actual", alpha=0.5) - ax.set_ylim(-10, 250) - ax.set_xlim(-15, 15) - ax.set_ylabel("y") - ax.set_xlabel("x") - ax.legend(loc="upper left") - ax.set_title("Pointwise Predictions vs Actual") - plt.savefig(f"{fn}.png") - else: - print("Dimension less than 2, cannot graph") + _, ax = plt.subplots() + ax.scatter(targets[:, 0], targets[:, 1], label="Targets", alpha=0.5) + ax.scatter(predictions[:, 0], predictions[:, 1], label="Predictions", alpha=0.5) + ax.set_xlim( + min(np.min(targets[:, 0]), np.min(predictions[:, 0])), + max(np.max(targets[:, 0]), np.max(predictions[:, 0])), + ) + ax.set_ylim( + min(np.min(targets[:, 1]), np.min(predictions[:, 1])), + max(np.max(targets[:, 1]), np.max(predictions[:, 1])), + ) + ax.set_xlabel("theta") + ax.set_ylabel("theta dot") + ax.legend(loc="upper left") + ax.set_title("Pointwise Predictions vs Actual") + plt.savefig(f"{fn}.png") -def plot_loss(loss_arr, fn): +def plot_loss(loss_arr, fn, title="Training Loss Across Epochs"): _, ax = plt.subplots() ax.plot(loss_arr) ax.set_ylim(min(0, min(loss_arr) - 1.0), max(loss_arr) + 1.0) ax.set_xlim(0, len(loss_arr)) ax.set_ylabel("Average Batch MSE") ax.set_xlabel("Epoch") - ax.set_title("Training Loss Across Epochs") + ax.set_title(title) plt.savefig(f"{fn}.png") diff --git a/learning/modules/lqrc/run_plot.py b/learning/modules/lqrc/run_plot.py index a5eff73a..9a9b7f24 100644 --- 
a/learning/modules/lqrc/run_plot.py +++ b/learning/modules/lqrc/run_plot.py @@ -1,12 +1,12 @@ import os -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from learning.modules.lqrc.plotting import plot_training_data_dist read_path = "CholeskyPlusConst_fixed_mean_var_all_obs.npy" output_path = "CholeskyPlusConst_fixed_mean_var_data_dist_1_redo.png" -save_path = os.path.join(LEGGED_GYM_LQRC_DIR, "logs") +save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "lqrc") if __name__ == "__main__": plot_training_data_dist( - LEGGED_GYM_LQRC_DIR + f"/{read_path}", save_path + f"/{output_path}" + LEGGED_GYM_ROOT_DIR + f"/{read_path}", save_path + f"/{output_path}" ) diff --git a/learning/modules/lqrc/train_autoencoder.py b/learning/modules/lqrc/train_autoencoder.py new file mode 100644 index 00000000..913c3e91 --- /dev/null +++ b/learning/modules/lqrc/train_autoencoder.py @@ -0,0 +1,73 @@ +import os +import time +import torch +import torch.nn as nn +from autoencoder import Autoencoder +from gym import LEGGED_GYM_ROOT_DIR +from utils import autoencoder_args +from plotting import plot_loss, plot_autoencoder + + +def generate_data(lb, ub, n): + return (ub - lb).expand(n, -1) * torch.rand(n, ub.shape[-1]) + lb + + +if __name__ == "__main__": + args = vars(autoencoder_args()) + num_epochs = args["epochs"] + num_batches = args["batches"] + input_dim = args["input_dim"] + latent_dim = args["latent_dim"] + n = args["n"] + lb = torch.tensor([-torch.pi, -8.0]) + ub = torch.tensor([torch.pi, 8.0]) + + # model = Autoencoder(input_dim, latent_dim, [16, 8, 4, 2], "elu") + # model = Autoencoder(input_dim, latent_dim, [512, 256, 128], "elu") + model = Autoencoder(input_dim, latent_dim, [64, 32, 16, 8], "elu") + + # ! TODO: train on reference motion swing up trajectories + + loss = nn.MSELoss() + optimizer = torch.optim.Adam(model.parameters(), lr=0.0003) + training_loss = [] + for epoch in range(num_epochs): + batch_loss = [] + for batch in range(num_batches): + target = generate_data(lb, ub, n) + prediction = model(target) + loss = nn.MSELoss(reduction="mean")(target, prediction) + batch_loss.append(loss) + optimizer.zero_grad() + loss.backward() + optimizer.step() + training_loss.append(torch.mean(torch.tensor(batch_loss)).item()) + if epoch % 250 == 0: + print(f"Finished epoch {epoch}, latest loss {training_loss[-1]}") + print(f"Training finished with final loss of {training_loss[-1]}") + + test_loss = [] + targets = generate_data(lb, ub, 100) + predictions = torch.zeros_like(targets) + for ix, target in enumerate(targets): + prediction = model(target) + predictions[ix, :] = prediction + loss = nn.MSELoss(reduction="mean")(target, prediction) + test_loss.append(loss) + + print(f"Average loss on test set {torch.mean(torch.tensor(test_loss))}") + + save_str = "autoencoders" + time_str = time.strftime("%Y%m%d_%H%M%S") + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", save_str) + if not os.path.exists(save_path): + os.makedirs(save_path) + + plot_loss(training_loss, f"{save_path}/{save_str}_loss") + plot_autoencoder(targets, predictions, f"{save_path}/{save_str}_predictions") + + if args["save_model"]: + torch.save( + model.state_dict(), + save_path + f"/model_{time_str}" + ".pt", + ) diff --git a/learning/modules/lqrc/train_benchmark.py b/learning/modules/lqrc/train_benchmark.py index 69058087..51000092 100644 --- a/learning/modules/lqrc/train_benchmark.py +++ b/learning/modules/lqrc/train_benchmark.py @@ -1,6 +1,8 @@ import os import time +import numpy as np import torch 
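+# create_MLP is used by the "VanillaCritic" option in model_switch below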
+from learning.modules.utils.neural_net import create_MLP from torchviz import make_dot from torch.utils.data import DataLoader from torch.utils.data import random_split @@ -11,8 +13,10 @@ CustomCholeskyLoss, CholeskyPlusConst, CustomCholeskyPlusConstLoss, + CholeskyOffset1, + CholeskyOffset2, ) -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from utils import benchmark_args from plotting import ( plot_loss, @@ -100,6 +104,26 @@ def bound_function_smoothly(y): return X, y +def load_ground_truth(*args, **kwargs): + fn = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/ground_truth.npy" + data = torch.from_numpy(np.load(fn)).to(DEVICE).type(torch.float32) + return data[:, :-1], data[:, -1].unsqueeze(1) + + +def ground_truth_numerial_gradients(): + fn = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/ground_truth.npy" + data = np.load(fn) + sq_size = int(np.sqrt(data.shape[0]).item()) + gridded_data = data[:, -1].reshape((sq_size, sq_size)) + h_theta = abs(data[0, 0] - data[sq_size, 0]).item() + h_omega = abs(data[0, 1] - data[1, 1]).item() + theta_grad, theta_omega = np.gradient(gridded_data, h_theta, h_omega) + return np.concatenate( + (theta_grad.flatten().reshape(-1, 1), theta_omega.flatten().reshape(-1, 1)), + axis=1, + ) + + def model_switch(input_dim, model_name=None): if model_name == "QuadraticNetCholesky": return QuadraticNetCholesky(input_dim, device=DEVICE).to( @@ -109,10 +133,24 @@ def model_switch(input_dim, model_name=None): return CholeskyPlusConst(input_dim, device=DEVICE).to( DEVICE ), CustomCholeskyPlusConstLoss(const_penalty=0.1) - else: - return BaselineMLP(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( + elif model_name == "VanillaCritic": + num_obs = input_dim + hidden_dims = [128, 64, 32] + activation = "tanh" + return create_MLP(num_obs, 1, hidden_dims, activation).to( + DEVICE + ), torch.nn.MSELoss(reduction="mean") + elif model_name == "CholeskyOffset1": + return CholeskyOffset1(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( + reduction="mean" + ) + elif model_name == "CholeskyOffset2": + return CholeskyOffset2(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( reduction="mean" ) + return BaselineMLP(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( + reduction="mean" + ) def test_case_switch(case_name=None): @@ -122,8 +160,17 @@ def test_case_switch(case_name=None): return generate_cos elif case_name == "quadratic": return generate_nD_quadratic - else: - assert case_name is not None, "Please specify a valid test case when running." + elif case_name == "ground_truth": + return load_ground_truth + assert case_name is not None, "Please specify a valid test case when running." 
+ + +def gradient_switch(test_case, **kwargs): + if test_case == "rosenbrock": + return generate_rosenbrock_grad(torch.vstack(kwargs["all_inputs"])) + elif test_case == "ground_truth": + return ground_truth_numerial_gradients() + return None if __name__ == "__main__": @@ -163,11 +210,7 @@ def test_case_switch(case_name=None): model, loss_fn = model_switch(input_dim, model_type) print(model) - optimizer = torch.optim.Adam( - model.parameters(), - betas=(0.75, 0.9), - weight_decay=2.074821225483474e-07, - ) + optimizer = torch.optim.Adam(model.parameters()) # train model.train() @@ -180,9 +223,9 @@ def test_case_switch(case_name=None): X_batch.requires_grad_() y_pred = model(X_batch) loss = ( - loss_fn(y_pred, y_batch) - if model_type == "BaselineMLP" - else loss_fn(y_pred, y_batch, intermediates=model.intermediates) + loss_fn(y_pred, y_batch, intermediates=model.intermediates) + if model_type == "CholeskyPlusConst" + else loss_fn(y_pred, y_batch) ) loss_per_batch.append(loss.item()) optimizer.zero_grad() @@ -213,6 +256,13 @@ def test_case_switch(case_name=None): batch_size=1, shuffle=False, ) + if args["test_case"] == "ground_truth": + graphing_data = LQRCDataset(*load_ground_truth()) + test_dataloader = DataLoader( + torch.utils.data.Subset(graphing_data, range(len(graphing_data))), + batch_size=1, + shuffle=False, + ) for X_batch, y_batch in test_dataloader: # give X_batch a grad_fn @@ -220,11 +270,10 @@ def test_case_switch(case_name=None): X_batch = X_batch + 0 y_pred = model(X_batch) loss = ( - loss_fn(y_pred, y_batch) - if model_type == "BaselineMLP" - else loss_fn(y_pred, y_batch, intermediates=model.intermediates) + loss_fn(y_pred, y_batch, intermediates=model.intermediates) + if model_type == "CholeskyPlusConst" + else loss_fn(y_pred, y_batch) ) - # could wrap BaselineMLP in a loss that ignores intermediates to eliminate if/else if model_type == "QuadraticNetCholesky": # ! do we want this anymore? 
A_pred = model.intermediates["A"] @@ -252,7 +301,7 @@ print(f"Total training time: {timer.get_time('total training')}") time_str = time.strftime("%Y%m%d_%H%M%S") - save_path = os.path.join(LEGGED_GYM_LQRC_DIR, "logs", save_str) + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", save_str) if not os.path.exists(save_path): os.makedirs(save_path) if save_model: @@ -265,9 +314,11 @@ params=dict(model.named_parameters()), show_attrs=True, show_saved=True, - ).render(f"{LEGGED_GYM_LQRC_DIR}/logs/model_viz", format="png") + ).render(f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/model_viz", format="png") print("Saving to", save_path) + grad_kwargs = {"all_inputs": all_inputs} if test_case == "rosenbrock" else {} + plot_predictions_and_gradients( input_dim + 1, torch.vstack(all_inputs), @@ -277,9 +328,7 @@ f"{save_path}/{time_str}_grad_graph", colormap_diff=args["colormap_diff"], colormap_values=args["colormap_values"], - actual_grad=generate_rosenbrock_grad(torch.vstack(all_inputs)) - if test_case == "rosenbrock" - else None, + actual_grad=gradient_switch(test_case, **grad_kwargs), ) plot_loss(training_losses, f"{save_path}/{time_str}_loss") diff --git a/learning/modules/lqrc/utils.py b/learning/modules/lqrc/utils.py index 8e7cd9c7..82c879bd 100644 --- a/learning/modules/lqrc/utils.py +++ b/learning/modules/lqrc/utils.py @@ -112,6 +112,58 @@ def critic_eval_args(): return args +def autoencoder_args(): + parser = argparse.ArgumentParser( + description="Toggle between autoencoder training settings." + ) + parser.add_argument( + "--input_dim", + action="store", + type=int, + nargs="?", + default=1, + help="Dimension of the input variables", + ) + parser.add_argument( + "--latent_dim", + action="store", + type=int, + nargs="?", + default=1, + help="Dimension of the latent space", + ) + parser.add_argument( + "--epochs", + action="store", + type=int, + nargs="?", + default=1000, + help="Number of training epochs", + ) + parser.add_argument( + "--batches", + action="store", + type=int, + nargs="?", + default=4, + help="Number of randomly generated batches per epoch.", + ) + parser.add_argument( + "--n", + action="store", + type=int, + nargs="?", + default=100, + help="Number of samples generated per batch", + ) + parser.add_argument("--save_model", action="store_true", help="Save the model") + try: + args = parser.parse_args() + except SystemExit: + print("Exception occurred! You've likely misspelled a key.") + return args + + def get_load_path(name, load_run=-1, checkpoint=-1): root = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", name) run_path = select_run(root, load_run) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 11be1b14..246bafa4 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -2,31 +2,10 @@ import torch.nn as nn -def get_mean_var_with_masks(values, masks): - """ - Applies a mask to the input values and calculates - the mean and variance over the valid entries, as specified - by the mask. - """ - sum_mask = masks.sum() - masked_vals = values * masks - values_mean = masked_vals.sum() / sum_mask - min_sqr = (((masked_vals) ** 2) / sum_mask).sum() - ( - (masked_vals / sum_mask).sum() - ) ** 2 - values_var = min_sqr * sum_mask / (sum_mask - 1) - return values_mean, values_var - - class RunningMeanStd(nn.Module): - """ - Keeps a running mean to normalize tensor of choice. 
- """ - - def __init__(self, num_items, axis=0, epsilon=1e-05): - super(RunningMeanStd, self).__init__() + def __init__(self, num_items, epsilon=1e-05): + super().__init__() self.num_items = num_items - self.axis = axis self.epsilon = epsilon self.register_buffer( @@ -44,11 +23,6 @@ def _update_mean_var_from_moments( batch_var, batch_count, ): - """ - Implements parallel algorithm for combining arbitrary sets A and B - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - #Parallel_algorithm - """ tot_count = running_count + batch_count delta = batch_mean - running_mean @@ -64,16 +38,10 @@ def _update_mean_var_from_moments( new_var = M2 / (tot_count - 1) return new_mean, new_var, tot_count - def forward(self, input, mask=None): - """ - Returns the normalized version of the input. - """ + def forward(self, input): if self.training: - if mask is not None: - mean, var = get_mean_var_with_masks(input, mask) - else: - mean = input.mean(self.axis) - var = input.var(self.axis) + mean = input.mean(tuple(range(input.dim() - 1))) + var = input.var(tuple(range(input.dim() - 1))) ( self.running_mean, self.running_var, @@ -89,8 +57,6 @@ def forward(self, input, mask=None): current_mean = self.running_mean current_var = self.running_var - # current_mean = torch.tensor([torch.pi, 0.0], device="cuda:0") - # current_var = torch.tensor([((200.0)**2.0/12.0), ((2.0*torch.pi)/12.0)], device="cuda:0") y = (input - current_mean.float()) / torch.sqrt( current_var.float() + self.epsilon diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9b227e1d..cac170dc 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import ActorCritic +from learning.modules import Actor, Critic from learning.utils import remove_zero_weighted_rewards @@ -10,31 +10,32 @@ def __init__(self, env, train_cfg, device="cpu"): self.env = env self.parse_train_cfg(train_cfg) - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **self.policy_cfg - ).to(self.device) - - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 - - # * init storage and model - self.init_storage() self.log_dir = train_cfg["log_dir"] + self._set_up_alg() + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["policy"]["reward"]["weights"]) - self.policy_cfg = train_cfg["policy"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + remove_zero_weighted_rewards( + 
train_cfg["critic"]["reward"]["termination_weight"] + ) + self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] def init_storage(self): raise NotImplementedError diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index 7f474ff2..da07bb47 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -32,4 +32,8 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner -from .data_logging_runner import DataLoggingRunner +from .old_policy_runner import OldPolicyRunner +from .custom_critic_runner import CustomCriticRunner +from .critic_only_runner import CriticOnlyRunner +from .lqr_data_gen_runner import LQRDataGenRunner +from .datalogging_runner import DataLoggingRunner diff --git a/learning/runners/critic_only_runner.py b/learning/runners/critic_only_runner.py index 4c523e37..d5a9eb4a 100644 --- a/learning/runners/critic_only_runner.py +++ b/learning/runners/critic_only_runner.py @@ -1,9 +1,197 @@ +import os +import numpy as np import torch -from .data_logging_runner import DataLoggingRunner +from tensordict import TensorDict +from gym import LEGGED_GYM_ROOT_DIR +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor +from learning.utils import Logger -class CriticOnlyRunner(DataLoggingRunner): - def load(self, path): +from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + + +logger = Logger() +storage = DictStorage() + + +class CriticOnlyRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + critic_class_name = self.critic_cfg["critic_class_name"] + num_actions = self.get_action_size(self.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + # critic = Critic(num_critic_obs, **self.critic_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) + self.all_obs[: self.env.num_envs, :] = actor_obs + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + 
self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + start = self.env.num_envs * self.it + end = self.env.num_envs * (self.it + 1) + self.all_obs[start:end, :] = actor_obs + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "standard_training_data.npy" + if self.critic_cfg["standard_critic_nn"] + else "custom_training_data.npy", + ) + + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") + self.save() + + def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict( - loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py new file mode 100644 index 00000000..d2621b78 --- /dev/null +++ b/learning/runners/custom_critic_runner.py @@ -0,0 +1,178 @@ +import os +import numpy as np +import torch +from tensordict import TensorDict + +from gym import LEGGED_GYM_ROOT_DIR +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor +from learning.utils 
import Logger + +from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class CustomCriticRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + critic_class_name = self.critic_cfg["critic_class_name"] + num_actions = self.get_action_size(self.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) + self.all_obs[: self.env.num_envs, :] = actor_obs + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + start = self.env.num_envs * self.it + end = self.env.num_envs * (self.it + 1) + self.all_obs[start:end, :] = actor_obs + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + 
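+            # file name records which critic produced the data: Cholesky-style critics
+            # write custom_training_data.npy, all other critic classes write standard_training_data.npy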
"standard_training_data.npy" + if "Cholesky" not in self.critic_cfg["critic_class_name"] + else "custom_training_data.npy", + ) + + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py new file mode 100644 index 00000000..d7847362 --- /dev/null +++ b/learning/runners/datalogging_runner.py @@ -0,0 +1,232 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, Critic # noqa: F401 +from learning.modules.lqrc import ( + CustomCriticBaseline, + Cholesky, + CholeskyPlusConst, + CholeskyOffset1, + CholeskyOffset2, +) # noqa F401 +from learning.storage import DictStorage +from learning.utils import export_to_numpy + +logger = Logger() +storage = DictStorage() + + +class DataLoggingRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + critic_class_name = self.critic_cfg["critic_class_name"] + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + self.save() + # burn in 
observation normalization. + if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + self.env.reset() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "terminated": terminated, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.save() + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + # if self.it % self.save_interval == 0: + # self.save() + # self.save() + # print(f"Saved data from LQR Pendulum run to {dict_save_path}") + + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + 
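+        # alongside the model checkpoint, dump the raw rollout storage both as a
+        # TensorDict (.pt) and as compressed numpy arrays (.npz) for offline analysis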
path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/lqr_data_gen_runner.py b/learning/runners/lqr_data_gen_runner.py new file mode 100644 index 00000000..5aafa608 --- /dev/null +++ b/learning/runners/lqr_data_gen_runner.py @@ -0,0 +1,155 @@ +import os +import time +import torch +from tensordict import TensorDict + +from gym import LEGGED_GYM_ROOT_DIR +from learning.utils import Logger +from .custom_critic_runner import OnPolicyRunner +from learning.storage import DictStorage +from learning.utils import export_to_numpy + +logger = Logger() +storage = DictStorage() + + +class LQRDataGenRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + self.total_steps = self.num_learning_iterations * self.num_steps_per_env + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + + dict_save_path = self.create_data_save_path_and_name() + + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + 
) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + torch.save(storage.data.cpu(), dict_save_path + str(self.it) + ".pt") + export_to_numpy(storage.data, dict_save_path + str(self.it)) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + print(f"Saved data from LQR Pendulum run to {dict_save_path}") + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def create_data_save_path_and_name(self): + time_str = time.strftime("%Y%m%d_%H%M%S") + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "lqr_data", + f"run_{time_str}", + "iteration_", + ) + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + return save_path diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index e555b525..d97dee8d 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -1,12 +1,12 @@ import torch +from tensordict import TensorDict from learning.utils import Logger -from learning.utils import PotentialBasedRewardShaping -from learning.utils import remove_zero_weighted_rewards - from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage logger = Logger() +storage = DictStorage() class MyRunner(OnPolicyRunner): @@ -21,21 +21,33 @@ def __init__(self, env, train_cfg, device="cpu"): def learn(self): self.set_up_logger() - remove_zero_weighted_rewards(self.policy_cfg["reward"]["pbrs_weights"]) - PBRS = PotentialBasedRewardShaping( - self.policy_cfg["reward"]["pbrs_weights"], self.device - ) - logger.register_rewards(PBRS.get_reward_keys()) rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -45,39 +57,52 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + 
self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } ) - PBRS.pre_step(self.env) self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["actor_obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() dones = timed_out | terminated self.update_rewards(rewards_dict, terminated) - rewards_dict.update(PBRS.post_step(self.env, dones)) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") @@ -88,30 +113,15 @@ def learn(self): self.save() self.save() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated - ) - ) - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["weights"], - modifier=self.env.dt, - mask=~terminated, - ) - ) - def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/old_critic_only_runner.py b/learning/runners/old_critic_only_runner.py new file mode 100644 index 00000000..3a9de845 --- /dev/null +++ b/learning/runners/old_critic_only_runner.py @@ -0,0 +1,8 @@ +import torch +from .old_data_logging_runner import DataLoggingRunner + + +class CriticOnlyRunner(DataLoggingRunner): + def load(self, path): + loaded_dict = torch.load(path) + self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) diff --git a/learning/runners/data_logging_runner.py b/learning/runners/old_data_logging_runner.py similarity index 53% rename from learning/runners/data_logging_runner.py rename to learning/runners/old_data_logging_runner.py index f374d84c..dbedee99 100644 --- a/learning/runners/data_logging_runner.py +++ b/learning/runners/old_data_logging_runner.py @@ -1,13 +1,16 @@ import torch import numpy as np import os -from learning import LEGGED_GYM_LQRC_DIR +from tensordict import TensorDict +from learning import LEGGED_GYM_ROOT_DIR from learning.utils import 
Logger from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage logger = Logger() +storage = DictStorage() class DataLoggingRunner(OnPolicyRunner): @@ -20,20 +23,46 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + # def _set_up_alg(self): + # num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + # num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + # num_actions = self.get_action_size(self.actor_cfg["actions"]) + # actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + # critic = Critic(num_critic_obs, **self.critic_cfg) + # alg_class = eval(self.cfg["algorithm_class_name"]) + # self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) self.all_obs[: self.env.num_envs, :] = actor_obs self.save() + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -43,17 +72,25 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } ) self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) start = self.env.num_envs * self.it end = self.env.num_envs * (self.it + 1) @@ -67,18 +104,25 @@ def learn(self): self.update_rewards(rewards_dict, terminated) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") @@ -87,12 +131,14 @@ def learn(self): if self.it % self.save_interval == 0: self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() save_path = os.path.join( - LEGGED_GYM_LQRC_DIR, + LEGGED_GYM_ROOT_DIR, "logs", + "lqrc", "standard_training_data.npy" 
- if self.policy_cfg["standard_critic_nn"] + if self.critic_cfg["standard_critic_nn"] else "custom_training_data.npy", ) @@ -100,33 +146,32 @@ def learn(self): if not os.path.exists(dir_path): os.makedirs(dir_path) np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") self.save() def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py new file mode 100644 index 00000000..42f019a3 --- /dev/null +++ b/learning/runners/old_policy_runner.py @@ -0,0 +1,164 @@ +import os +import torch + +from learning.utils import Logger +from .BaseRunner import BaseRunner +from learning.algorithms import PPO # noqa: F401 +from learning.modules import ActorCritic, Actor, Critic + +logger = Logger() + + +class OldPolicyRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + self.init_storage() + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + actor_critic = ActorCritic(actor, critic) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.actor_critic.train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + self.save() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + 
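+                    # gather termination penalties (masked by terminated) and dt-scaled step
+                    # rewards, then sum them into a single total reward for this step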
self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + + self.alg.process_env_step(total_rewards, dones, timed_out) + self.alg.compute_returns(critic_obs) + logger.toc("collection") + + logger.tic("learning") + self.alg.update() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + + logger.attach_torch_obj_to_wandb( + (self.alg.actor_critic.actor, self.alg.actor_critic.critic) + ) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor_critic.actor.state_dict(), + "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor_critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor_critic.actor.act_inference(obs) + + def export(self, path): + self.alg.actor_critic.export_policy(path) + + def init_storage(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + self.alg.init_storage( + self.env.num_envs, + self.num_steps_per_env, + actor_obs_shape=[num_actor_obs], + critic_obs_shape=[num_critic_obs], + action_shape=[num_actions], + ) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9660dd66..816d03fb 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -1,11 +1,14 @@ import os import torch +from tensordict import TensorDict from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.storage import DictStorage logger = Logger() +storage = DictStorage() class OnPolicyRunner(BaseRunner): @@ -23,13 +26,38 @@ def learn(self): rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = 
self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + # burn in observation normalization. + if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -37,19 +65,27 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } ) self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) # * get time_outs timed_out = self.get_timed_out() @@ -59,18 +95,28 @@ def learn(self): self.update_rewards(rewards_dict, terminated) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "terminated": terminated, + "dones": dones, + } + ) + storage.add_transitions(transition) + logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") @@ -79,45 +125,59 @@ def learn(self): if self.it % self.save_interval == 0: self.save() - self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - 
self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( { - "actor_state_dict": self.alg.actor_critic.actor.state_dict(), - "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, }, path, @@ -125,30 +185,22 @@ def save(self): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) self.it = loaded_dict["iter"] def switch_to_eval(self): - self.alg.actor_critic.eval() + self.alg.actor.eval() + self.alg.critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) - return self.alg.actor_critic.actor.act_inference(obs) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) def export(self, path): - self.alg.actor_critic.export_policy(path) - - def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - self.alg.init_storage( - self.env.num_envs, - self.num_steps_per_env, - actor_obs_shape=[num_actor_obs], - critic_obs_shape=[num_critic_obs], - action_shape=[num_actions], - ) + self.alg.actor.export(path) diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index edd7bd55..f10df85f 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,3 +3,4 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage +from .dict_storage import DictStorage \ No newline at end of file diff --git a/learning/storage/base_storage.py b/learning/storage/base_storage.py index 66a78328..835d3b07 100644 --- a/learning/storage/base_storage.py +++ b/learning/storage/base_storage.py @@ -1,4 +1,4 @@ -""" Store data for different loop +"""Store data for different loop Design consideration: change the original 
"has-a" relationship of LT->rollout->transition to parallel. Transition stores all the needed data for every step and will be cleared/updated every step and passed to rollout and diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py new file mode 100644 index 00000000..2191093c --- /dev/null +++ b/learning/storage/dict_storage.py @@ -0,0 +1,48 @@ +import torch +from tensordict import TensorDict + + +class DictStorage: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length: + raise AssertionError("Rollout buffer overflow") + self.data[self.fill_count] = transition + self.fill_count += 1 + + def clear(self): + self.fill_count = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/storage/rollout_storage.py b/learning/storage/rollout_storage.py index bba48e02..38a315d7 100644 --- a/learning/storage/rollout_storage.py +++ b/learning/storage/rollout_storage.py @@ -182,7 +182,7 @@ def get_statistics(self): trajectory_lengths = done_indices[1:] - done_indices[:-1] return trajectory_lengths.float().mean(), self.rewards.mean() - def mini_batch_generator(self, num_mini_batches, num_epochs=8): + def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): batch_size = self.num_envs * self.num_transitions_per_env mini_batch_size = batch_size // num_mini_batches indices = torch.randperm( diff --git a/learning/utils/PBRS/test_usecase.py b/learning/utils/PBRS/test_usecase.py index a84ef547..be5d0412 100644 --- a/learning/utils/PBRS/test_usecase.py +++ b/learning/utils/PBRS/test_usecase.py @@ -30,14 +30,14 @@ def test_PBRS(): logger.initialize(num_envs=1, episode_dt=0.1, total_iterations=100, device="cpu") self_env = MyTask() - # let's create a dummy policy_cfg with just the reward weights - policy_cfg = {"reward": {"weights": {"dummy": 1}}} - policy_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} + # let's create a dummy critic_cfg with just the reward weights + critic_cfg = {"reward": {"weights": {"dummy": 1}}} + critic_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} # and a dummy env PBRS = PotentialBasedRewardShaping( - policy_cfg["reward"]["pbrs_weights"], device="cpu" + critic_cfg["reward"]["pbrs_weights"], device="cpu" ) assert PBRS.get_reward_keys() == ["PBRS_first", "PBRS_second"] diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index a39a3daf..15c674d6 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,40 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin from .utils import ( - split_and_pad_trajectories, - unpad_trajectories, remove_zero_weighted_rewards, set_discount_from_horizon, ) +from .dict_utils import * from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py new file mode 100644 index 00000000..d73709fa --- /dev/null +++ b/learning/utils/dict_utils.py @@ -0,0 +1,92 @@ +import numpy as np +import torch +from tensordict import TensorDict + + +@torch.no_grad +def compute_MC_returns(data: TensorDict, gamma, critic=None): + # todo not as accurate as taking + if critic is None: + last_values = torch.zeros_like(data["rewards"][0]) + else: + last_values = critic.evaluate(data["critic_obs"][-1]) + + returns = torch.zeros_like(data["rewards"]) + returns[-1] = data["rewards"][-1] + gamma * last_values * ~data["terminated"][-1] + for k in reversed(range(data["rewards"].shape[0] - 1)): + not_done = ~data["dones"][k] + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + if critic is not None: + returns[k] += ( + gamma + * critic.evaluate(data["critic_obs"][k]) + * data["timed_out"][k] + * ~data["terminated"][k] + ) + return returns + + +@torch.no_grad +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) + + +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic): + data["values"] = critic.evaluate(data["critic_obs"]) + last_values = critic.evaluate(data["next_critic_obs"][-1]) + advantages = torch.zeros_like(data["values"]) + if last_values is not None: + # todo check this + not_done = ~data["dones"][-1] + advantages[-1] = ( + data["rewards"][-1] + + gamma * data["values"][-1] * data["timed_out"][-1] + + gamma * last_values * not_done + - data["values"][-1] + ) + + for k in reversed(range(data["values"].shape[0] - 1)): + not_done = ~data["dones"][k] + td_error = ( + 
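+            # one-step TD error: bootstrap from V(s_{k+1}) while the episode continues,
+            # reuse V(s_k) on a time-out, and skip bootstrapping entirely on true termination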
data["rewards"][k] + + gamma * data["values"][k] * data["timed_out"][k] + + gamma * data["values"][k + 1] * not_done + - data["values"][k] + ) + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] + + return advantages + + +# todo change num_epochs to num_batches +@torch.no_grad +def create_uniform_generator( + data, batch_size, num_epochs=1, max_gradient_steps=None, keys=None +): + n, m = data.shape + total_data = n * m + + if batch_size > total_data: + batch_size = total_data + + num_batches_per_epoch = total_data // batch_size + if max_gradient_steps: + num_epochs = max_gradient_steps // num_batches_per_epoch + num_epochs = max(num_epochs, 1) + for epoch in range(num_epochs): + indices = torch.randperm(total_data, device=data.device) + for i in range(num_batches_per_epoch): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data + + +@torch.no_grad +def export_to_numpy(data, path): + # check if path ends iwth ".npz", and if not append it. + if not path.endswith(".npz"): + path += ".npz" + np.savez_compressed(path, **{key: val.cpu().numpy() for key, val in data.items()}) + return diff --git a/learning/utils/logger/Logger.py b/learning/utils/logger/Logger.py index f84626df..bcaa56a2 100644 --- a/learning/utils/logger/Logger.py +++ b/learning/utils/logger/Logger.py @@ -122,19 +122,23 @@ def separator(subtitle="", marker="-"): ) print(log_string) - def log_category(self, category="algorithm"): - self.iteration_logs.log(category) + def log_all_categories(self): + for category in self.iteration_logs.logs.keys(): + self.iteration_logs.log(category) def log_to_wandb(self): - def prepend_to_keys(section, dictionary): - return {section + "/" + key: val for key, val in dictionary.items()} + def prepend_to_keys(prefix, dictionary): + return {prefix + "/" + key: val for key, val in dictionary.items()} averages = prepend_to_keys("rewards", self.reward_logs.get_average_rewards()) - algorithm_logs = prepend_to_keys( - "algorithm", self.iteration_logs.get_all_logs("algorithm") - ) - wandb.log({**averages, **algorithm_logs}) + category_logs = { + f"{category}/{key}": val + for category in self.iteration_logs.logs.keys() + for key, val in self.iteration_logs.get_all_logs(category).items() + } + + wandb.log({**averages, **category_logs}) def tic(self, category="default"): self.timer.tic(category) diff --git a/learning/utils/logger/test_logger.py b/learning/utils/logger/test_logger.py index ebd0b859..eaef76f6 100644 --- a/learning/utils/logger/test_logger.py +++ b/learning/utils/logger/test_logger.py @@ -126,8 +126,7 @@ def test_logging_iteration(): logger.register_category("performanceAB", mocky, ["a", "b"]) for _ in range(10): - logger.log_category("performanceA") - logger.log_category("performanceAB") + logger.log_all_categories() logger.finish_iteration() for key, val in logger.iteration_logs.logs["performanceA"].items(): diff --git a/learning/utils/tests/test_returns.py b/learning/utils/tests/test_returns.py new file mode 100644 index 00000000..1876c123 --- /dev/null +++ b/learning/utils/tests/test_returns.py @@ -0,0 +1,133 @@ +import pytest +import torch +from tensordict import TensorDict +from learning.utils import compute_MC_returns, compute_generalized_advantages + + +class CriticConstant(torch.nn.Module): + def __init__(self, output): + super().__init__() + self.output = output + + def evaluate(self, obs): + return self.output * torch.ones(obs.shape[:-1]) + + +@pytest.fixture +def setup_data(): + n_timesteps = 4 + n_envs = 5 
+ rewards = torch.ones((n_timesteps, n_envs)) + terminated = torch.zeros((n_timesteps, n_envs), dtype=torch.bool) + timed_out = torch.zeros_like(terminated) + critic_obs = torch.rand((n_timesteps, n_envs, 1)) + + # Terminating conditions setup + timed_out[-1, [0, 2]] = True + terminated[-1, [1, 2]] = True + timed_out[1, 3] = True + terminated[1, 4] = True + + dones = timed_out | terminated + data = TensorDict( + { + "rewards": rewards, + "timed_out": timed_out, + "terminated": terminated, + "dones": dones, + "critic_obs": critic_obs, + "next_critic_obs": torch.roll(critic_obs, shifts=-1, dims=0), + }, + batch_size=(n_timesteps, n_envs), + ) + + return data + + +def test_critic_always_zero_gamma_zero(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=0.0, critic=critic) + expected_returns = data["rewards"] + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_four_gamma_zero(setup_data): + data = setup_data + critic = CriticConstant(4) + returns = compute_MC_returns(data, gamma=0.0, critic=critic) + expected_returns = data["rewards"] + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_zero_gamma_one(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=1.0, critic=critic) + expected_returns = torch.tensor( + [ + [4.0, 3.0, 2.0, 1.0], + [4.0, 3.0, 2.0, 1.0], + [4.0, 3.0, 2.0, 1.0], + [2.0, 1.0, 2.0, 1.0], + [2.0, 1.0, 2.0, 1.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_four_gamma_one(setup_data): + data = setup_data + critic = CriticConstant(4) + returns = compute_MC_returns(data, gamma=1.0, critic=critic) + expected_returns = torch.tensor( + [ + [8.0, 7.0, 6.0, 5.0], + [4.0, 3.0, 2.0, 1.0], + [4.0, 3.0, 2.0, 1.0], + [6.0, 5.0, 6.0, 5.0], + [2.0, 1.0, 6.0, 5.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_zero_gamma_half(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=0.5, critic=critic) + expected_returns = torch.tensor( + [ + [1.875, 1.75, 1.5, 1.0], + [1.875, 1.75, 1.5, 1.0], + [1.875, 1.75, 1.5, 1.0], + [1.5, 1.0, 1.5, 1.0], + [1.5, 1.0, 1.5, 1.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_four_gamma_half(setup_data): + data = setup_data + critic = CriticConstant(6.0) + returns = compute_MC_returns(data, gamma=0.5, critic=critic) + expected_returns = torch.tensor( + [ + [2.25, 2.5, 3.0, 4.0], + [1.875, 1.75, 1.5, 1.0], + [1.875, 1.75, 1.5, 1.0], + [3.0, 4.0, 3.0, 4.0], + [1.5, 1.0, 3.0, 4.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) + + +def test_gae_critic_always_zero_lambda_one(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=0.5, critic=critic) + advantages = compute_generalized_advantages(data, gamma=0.5, lam=1.0, critic=critic) + expected_advantages = returns - critic.evaluate(data["critic_obs"]) + torch.testing.assert_close(advantages, expected_advantages) diff --git a/learning/utils/utils.py b/learning/utils/utils.py index b1a20bcf..7a54c65a 100644 --- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -1,96 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - -import torch - - -def split_and_pad_trajectories(tensor, dones): - """ - Splits trajectories at done indices. Then concatenates them and pads - with zeros up to the length og the longest trajectory. 
-    Returns masks corresponding to valid parts of the trajectories
-    Example:
-        Input: [ [a1, a2, a3, a4 | a5, a6],
-                 [b1, b2 | b3, b4, b5 | b6]
-                ]
-
-        Output:[ [a1, a2, a3, a4], | [ [True, True, True, True],
-                 [a5, a6, 0, 0],   | [True, True, False, False],
-                 [b1, b2, 0, 0],   | [True, True, False, False],
-                 [b3, b4, b5, 0],  | [True, True, True, False],
-                 [b6, 0, 0, 0]     | [True, False, False, False],
-                ]                  | ]
-
-    Assumes that the inputy has the following dimension order:
-    [time, number of envs, aditional dimensions]
-    """
-    dones = dones.clone()
-    dones[-1] = 1
-    # * Permute the buffers to have order
-    # * (num_envs, num_transitions_per_env, ...), for correct reshaping
-    flat_dones = dones.transpose(1, 0).reshape(-1, 1)
-
-    # * Get length of trajectory by counting the number of
-    # * successive not done elements
-    done_indices = torch.cat(
-        (
-            flat_dones.new_tensor([-1], dtype=torch.int64),
-            flat_dones.nonzero()[:, 0],
-        )
-    )
-    trajectory_lengths = done_indices[1:] - done_indices[:-1]
-    trajectory_lengths_list = trajectory_lengths.tolist()
-    # * Extract the individual trajectories
-    trajectories = torch.split(
-        tensor.transpose(1, 0).flatten(0, 1), trajectory_lengths_list
-    )
-    padded_trajectories = torch.nn.utils.rnn.pad_sequence(trajectories)
-
-    trajectory_masks = trajectory_lengths > torch.arange(
-        0, tensor.shape[0], device=tensor.device
-    ).unsqueeze(1)
-    return padded_trajectories, trajectory_masks
-
-
-def unpad_trajectories(trajectories, masks):
-    """Does the inverse operation of split_and_pad_trajectories()"""
-    # * Need to transpose before and after the masking to have proper reshaping
-    return (
-        trajectories.transpose(1, 0)[masks.transpose(1, 0)]
-        .view(-1, trajectories.shape[0], trajectories.shape[-1])
-        .transpose(1, 0)
-    )
-
-
 def remove_zero_weighted_rewards(reward_weights):
     for name in list(reward_weights.keys()):
         if reward_weights[name] == 0:
diff --git a/requirements.txt b/requirements.txt
index d08d918a..3f100110 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,9 +7,10 @@ torch>=2.0
 torchvision
 torchaudio
 pygame
-pytest
+pytest<8.0
 mss
 ruff
 pre-commit
 onnx
-onnxruntime
\ No newline at end of file
+onnxruntime
+tensordict
\ No newline at end of file
diff --git a/scripts/lqrc/brute_force_value_function.py b/scripts/lqrc/brute_force_value_function.py
index fd3baeb8..5634f710 100644
--- a/scripts/lqrc/brute_force_value_function.py
+++ b/scripts/lqrc/brute_force_value_function.py
@@ -4,12 +4,12 @@
 from gym.envs import __init__  # noqa: F401
 from gym import LEGGED_GYM_ROOT_DIR
 from gym.utils import get_args, task_registry
-from learning import LEGGED_GYM_LQRC_DIR
 from learning.modules import Critic
 from learning.modules.lqrc.utils import get_load_path
 from learning.modules.lqrc.plotting import (
     plot_value_func,
     plot_value_func_error,
+    plot_training_data_dist,
 )
 
 from isaacgym import gymtorch
@@ -98,15 +98,15 @@ def get_ground_truth(env, runner, train_cfg, grid):
     )
     rewards_dict = {}
     rewards = np.zeros((runner.num_steps_per_env, env.num_envs))
-    reward_list = runner.policy_cfg["reward"]["weights"]
+    reward_list = runner.critic_cfg["reward"]["weights"]
     n_steps = runner.num_steps_per_env
     states_to_log, logs = create_logging_dict(runner, n_steps, reward_list)
 
     for i in range(runner.num_steps_per_env):
         runner.set_actions(
-            runner.policy_cfg["actions"],
+            runner.actor_cfg["actions"],
             runner.get_inference_actions(),
-            runner.policy_cfg["disable_actions"],
+            runner.actor_cfg["disable_actions"],
         )
         env.step()
         terminated = runner.get_terminated()
@@ -152,9 +152,9 @@ def query_value_function(vf_args, grid):
     DEVICE = "cuda:0"
     steps = 50
     npy_fn = (
-        f"{LEGGED_GYM_LQRC_DIR}/logs/custom_training_data.npy"
+        f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/custom_training_data.npy"
         if args.custom_critic
-        else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_training_data.npy"
+        else f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/standard_training_data.npy"
     )
     data = np.load(npy_fn)
     # ! data is normalized
@@ -166,7 +166,7 @@ def query_value_function(vf_args, grid):
 
     EXPORT_POLICY = False
     time_str = time.strftime("%b%d_%H-%M-%S")
-    save_path = os.path.join(LEGGED_GYM_LQRC_DIR, f"logs/{time_str}")
+    save_path = os.path.join(LEGGED_GYM_ROOT_DIR, f"logs/{time_str}")
     if not os.path.exists(save_path):
         os.makedirs(save_path)
     # get ground truth
@@ -174,45 +174,47 @@ def query_value_function(vf_args, grid):
 
     env, runner, train_cfg = setup(args)
     ground_truth_returns, logs = get_ground_truth(env, runner, train_cfg, grid)
-    logs = {k: v.detach().cpu().numpy() for k, v in logs.items()}
-    np.savez_compressed(os.path.join(save_path, "data.npz"), **logs)
-    print("saved logs to", os.path.join(save_path, "data.npz"))
-
-    high_gt_returns = []
-    for i in range(ground_truth_returns.shape[1] - 1):
-        if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0:
-            high_gt_returns.append(
-                torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy()
-            )
-        if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0:
-            high_gt_returns.append(
-                torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy()
-            )
-    high_gt_returns = np.array(high_gt_returns)
-    returns_save_path = (
-        f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy"
-        if args.custom_critic
-        else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy"
-    )
-    returns_save_path = (
-        f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy"
-        if args.custom_critic
-        else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy"
-    )
-    np.save(returns_save_path, high_gt_returns)
-    print("Saved high returns to", returns_save_path)
+    grid_gt_combined = np.hstack((grid.detach().cpu().numpy(), ground_truth_returns.T))
+    gt_save_path = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/ground_truth.npy"
+    np.save(gt_save_path, grid_gt_combined)
+    print("Saved ground truth returns to", gt_save_path)
+
+    exit()
+
+    # logs = {k: v.detach().cpu().numpy() for k, v in logs.items()}
+    # np.savez_compressed(os.path.join(save_path, "data.npz"), **logs)
+    # print("saved logs to", os.path.join(save_path, "data.npz"))
+
+    # high_gt_returns = []
+    # for i in range(ground_truth_returns.shape[1] - 1):
+    #     if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0:
+    #         high_gt_returns.append(
+    #             torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy()
+    #         )
+    #     if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0:
+    #         high_gt_returns.append(
+    #             torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy()
+    #         )
+    # high_gt_returns = np.array(high_gt_returns)
+    # returns_save_path = (
+    #     f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/custom_high_returns.npy"
+    #     if args.custom_critic
+    #     else f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/standard_high_returns.npy"
+    # )
+    # np.save(returns_save_path, high_gt_returns)
+    # print("Saved high returns to", returns_save_path)
 
     # get NN value functions
     custom_vf_args = {
-        "experiment_name": "pendulum_custom_critic",
-        "load_run": "Feb22_14-11-04_custom_critic",
+        "experiment_name": "pendulum_critic_only",
+        "load_run": "Mar01_11-03-27_custom_critic_only",
         "checkpoint": -1,
         "model_type": "CholeskyPlusConst",
     }
     custom_critic_returns = query_value_function(custom_vf_args, grid)
 
     standard_vf_args = {
-        "experiment_name": "pendulum_standard_critic",
-        "load_run": "Feb22_14-07-29_standard_critic",
+        "experiment_name": "pendulum_critic_only",
+        "load_run": "Mar01_11-06-37_standard_critic_only",
         "checkpoint": -1,
         "model_type": "StandardMLP",
     }
@@ -237,5 +239,4 @@ def query_value_function(vf_args, grid):
     )
 
     # ! store data dist with model logs to ensure they're paired properly
-    # plot_training_data_dist(npy_fn,
-    #                         save_path + "/data_distribution.png")
+    plot_training_data_dist(npy_fn, save_path + "/data_distribution.png")
diff --git a/scripts/lqrc/rollout_init_conditions.py b/scripts/lqrc/rollout_init_conditions.py
index 4209abab..8f54dd1d 100644
--- a/scripts/lqrc/rollout_init_conditions.py
+++ b/scripts/lqrc/rollout_init_conditions.py
@@ -7,7 +7,7 @@
 # torch needs to be imported after isaacgym imports in local source
 import torch
 
-from learning import LEGGED_GYM_LQRC_DIR
+from learning import LEGGED_GYM_ROOT_DIR
 from learning.modules.lqrc.plotting import plot_trajectories
 
 
@@ -65,9 +65,9 @@ def play(env, runner, train_cfg, init_conditions):
         if env.cfg.viewer.record:
             recorder.update(i)
         runner.set_actions(
-            runner.policy_cfg["actions"],
+            runner.actor_cfg["actions"],
             runner.get_inference_actions(),
-            runner.policy_cfg["disable_actions"],
+            runner.actor_cfg["disable_actions"],
         )
         env.step()
         pos_traj[i, :] = env.dof_pos.detach().cpu().numpy().squeeze()
@@ -89,9 +89,9 @@ def play(env, runner, train_cfg, init_conditions):
     args = get_args()
     DEVICE = "cuda:0"
     npy_fn = (
-        f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy"
+        f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/custom_high_returns.npy"
         if args.custom_critic
-        else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy"
+        else f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/standard_high_returns.npy"
     )
     init_conditions = np.array([[0.25, -0.75, 0.2, -0.7]])  # np.load(npy_fn)
     num_init_conditions = 2
@@ -100,13 +100,16 @@ def play(env, runner, train_cfg, init_conditions):
     pos_traj, vel_traj, torques, rewards = play(
         env, runner, train_cfg, init_conditions
     )
+
+    high = np.hstack((init_conditions[0, 0], init_conditions[0, 2]))
+    low = np.hstack((init_conditions[0, 1], init_conditions[0, 3]))
+    title = f"High Return Init {high}, Low Return Init {low}"
+
     plot_trajectories(
         pos_traj,
         vel_traj,
         torques,
         rewards,
-        f"{LEGGED_GYM_LQRC_DIR}/logs/trajectories.png",
-        title=f"High Return Init (theta, omega) {np.hstack((init_conditions[0, 0], init_conditions[0, 2]))}, Low Return Init {np.hstack((init_conditions[0, 1], init_conditions[0, 3]))}",
+        f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/trajectories.png",
+        title=title,
     )
-    # plot_theta_omega_polar(np.vstack((init_conditions[:, 0], init_conditions[:, 2])), np.vstack((init_conditions[:, 1], init_conditions[:, 3])), npy_fn[:-4] + ".png")
-    # print(init_conditions[0, :])
diff --git a/scripts/lqrc/save_video.py b/scripts/lqrc/save_video.py
new file mode 100644
index 00000000..755f687d
--- /dev/null
+++ b/scripts/lqrc/save_video.py
@@ -0,0 +1,35 @@
+import cv2
+import os
+from gym import LEGGED_GYM_ROOT_DIR
+import re
+
+
+def extract_iteration(filename):
+    # Use regular expression to find the numbers following 'it'
+    match = re.search(r"it(\d+)", filename)
+    if match:
+        return int(match.group(1))
+    return 0  # Return 0 or some default value if 'it' followed by digits is not found
+
+
+image_folder = (
+    os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph")
+    + "/20240512_223831"
+)
+video_name = f"{image_folder}/video_100.avi"
+img_names = os.listdir(image_folder)
+# Sort the list using the custom key function
+img_names = sorted(img_names, key=extract_iteration)
+images = [
+    img for img in img_names if img.endswith(".png") and extract_iteration(img) > 60
+]
+frame = cv2.imread(os.path.join(image_folder, images[0]))
+height, width, layers = frame.shape
+
+video = cv2.VideoWriter(video_name, 0, 10, (width, height))
+
+for image in images:
+    video.write(cv2.imread(os.path.join(image_folder, image)))
+
+cv2.destroyAllWindows()
+video.release()
diff --git a/scripts/lqrc/simple_plots.py b/scripts/lqrc/simple_plots.py
index 06b30c98..846af7b2 100644
--- a/scripts/lqrc/simple_plots.py
+++ b/scripts/lqrc/simple_plots.py
@@ -1,6 +1,6 @@
 import numpy as np
 import matplotlib.pyplot as plt
-from learning import LEGGED_GYM_LQRC_DIR
+from learning import LEGGED_GYM_ROOT_DIR
 import matplotlib.collections as mcoll
 
 
@@ -202,13 +202,13 @@ def plot_initial_conditions_colored_by_return(
 
 if __name__ == "__main__":
     experiment = "Feb23_14-33-51"
-    file_path = f"{LEGGED_GYM_LQRC_DIR}/logs/{experiment}/data.npz"
+    file_path = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/{experiment}/data.npz"
     data = np.load(file_path)
 
     rewards_data = {key: data[key] for key in data.files if key[:2] == "r_"}
     data = {key: data[key] for key in data.files if key[:2] != "r_"}
 
-    save_path = f"{LEGGED_GYM_LQRC_DIR}/logs/{experiment}"
+    save_path = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/{experiment}"
 
     returns = calculate_cumulative_returns(rewards_data)
     # setting colormap outside, so that all returns-related plots use the same
diff --git a/scripts/multiple_offline_critics.py b/scripts/multiple_offline_critics.py
new file mode 100644
index 00000000..f190f983
--- /dev/null
+++ b/scripts/multiple_offline_critics.py
@@ -0,0 +1,130 @@
+import time
+from learning.modules.critic import Critic
+from learning.modules.lqrc import (
+    CustomCriticBaseline,
+    Cholesky,
+    CholeskyPlusConst,
+    CholeskyOffset1,
+    CholeskyOffset2,
+)  # noqa F401
+from learning.utils import (
+    compute_generalized_advantages,
+    compute_MC_returns,
+    create_uniform_generator,
+)
+from learning.modules.lqrc.plotting import (
+    plot_pendulum_multiple_critics,
+)
+from gym import LEGGED_GYM_ROOT_DIR
+import os
+import torch
+from torch import nn
+
+DEVICE = "cuda:0"
+# handle some bookkeeping
+run_name = "May12_13-27-05_standard_critic"  # "May03_20-49-23_standard_critic" "May02_08-59-46_standard_critic"
+log_dir = os.path.join(
+    LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name
+)
+time_str = time.strftime("%Y%m%d_%H%M%S")
+
+# create fresh critic
+custom_critic_params = {
+    "num_obs": 2,
+    "hidden_dims": None,
+    "activation": "elu",
+    "normalize_obs": False,
+    "output_size": 1,
+    "device": DEVICE,
+}
+vanilla_critic_params = {
+    "num_obs": 2,
+    "hidden_dims": [128, 64, 32],
+    "activation": "elu",
+    "normalize_obs": False,
+    "output_size": 1,
+    "device": DEVICE,
+}
+learning_rate = 1.0e-4
+critic_names = [
+    "Critic",
+    "Cholesky",
+    "CholeskyPlusConst",
+    "CholeskyOffset1",
+    "CholeskyOffset2",
+]
+test_critics = {
+    name: eval(f"{name}(**custom_critic_params).to(DEVICE)")
+    if not name == "Critic"
+    else eval(f"{name}(**vanilla_critic_params).to(DEVICE)")
+    for name in critic_names
+}
+
+critic_optimizers = {
+    name: torch.optim.Adam(critic.parameters(), lr=learning_rate)
+    for name, critic in test_critics.items()
+}
+gamma = 0.99
+lam = 1.0
+tot_iter = 200
+
+for iteration in range(tot_iter):
+    # load data
+    data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE)
+
+    # compute ground-truth
+    graphing_data = {data_name: {} for data_name in ["critic_obs", "values", "returns"]}
+
+    episode_rollouts = compute_MC_returns(data, gamma)
+    graphing_data["critic_obs"]["Ground Truth MC Returns"] = data[0, :]["critic_obs"]
+    graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :]
+    graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :]
+
+    for name, test_critic in test_critics.items():
+        critic_optimizer = critic_optimizers[name]
+        # train new critic
+        data["values"] = test_critic.evaluate(data["critic_obs"])
+        data["advantages"] = compute_generalized_advantages(
+            data, gamma, lam, test_critic
+        )
+        data["returns"] = data["advantages"] + data["values"]
+
+        mean_value_loss = 0
+        counter = 0
+        max_gradient_steps = 24
+        max_grad_norm = 1.0
+        batch_size = 2**16
+        generator = create_uniform_generator(
+            data,
+            batch_size,
+            max_gradient_steps=max_gradient_steps,
+        )
+        for batch in generator:
+            value_loss = test_critic.loss_fn(batch["critic_obs"], batch["returns"])
+            critic_optimizer.zero_grad()
+            value_loss.backward()
+            nn.utils.clip_grad_norm_(test_critic.parameters(), max_grad_norm)
+            critic_optimizer.step()
+            mean_value_loss += value_loss.item()
+            print(f"{name} value loss: ", value_loss.item())
+            counter += 1
+        mean_value_loss /= counter
+
+        graphing_data["critic_obs"][name] = data[0, :]["critic_obs"]
+        graphing_data["values"][name] = data[0, :]["values"]
+        graphing_data["returns"][name] = data[0, :]["returns"]
+
+    # compare new and old critics
+    save_path = os.path.join(
+        LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str
+    )
+    if not os.path.exists(save_path):
+        os.makedirs(save_path)
+
+    plot_pendulum_multiple_critics(
+        graphing_data["critic_obs"],
+        graphing_data["values"],
+        graphing_data["returns"],
+        title=f"Neural Network Training Iteration {iteration}",
+        fn=save_path + f"/{len(critic_names)}_critics_it{iteration}",
+    )
diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py
new file mode 100644
index 00000000..1a2a3e7a
--- /dev/null
+++ b/scripts/offline_critics.py
@@ -0,0 +1,85 @@
+import time
+from learning.modules.lqrc import Cholesky  # noqa F401
+from learning.utils import (
+    compute_generalized_advantages,
+    compute_MC_returns,
+    create_uniform_generator,
+)
+from learning.modules.lqrc.plotting import plot_pendulum_single_critic
+from gym import LEGGED_GYM_ROOT_DIR
+import os
+import torch
+from torch import nn
+
+DEVICE = "cuda:0"
+# handle some bookkeeping
+run_name = "May12_13-27-05_standard_critic"  # "May03_20-49-23_standard_critic" "May02_08-59-46_standard_critic"
+log_dir = os.path.join(
+    LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name
+)
+time_str = time.strftime("%Y%m%d_%H%M%S")
+
+# create fresh critic
+test_critic_params = {
+    "num_obs": 2,
+    "hidden_dims": None,
+    "activation": "elu",
+    "normalize_obs": False,
+    "output_size": 1,
+    "device": DEVICE,
+}
+learning_rate = 1.0e-4
+critic_name = "Cholesky"
+test_critic = eval(f"{critic_name}(**test_critic_params).to(DEVICE)")
+critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate)
+gamma = 0.99
+lam = 0.99
+tot_iter = 200
+
+for iteration in range(tot_iter):
+    # load data
+    data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE)
+
+    # compute ground-truth
+    episode_rollouts = compute_MC_returns(data, gamma)
+
+    # train new critic
+    data["values"] = test_critic.evaluate(data["critic_obs"])
+    data["advantages"] = compute_generalized_advantages(data, gamma, lam, test_critic)
+    data["returns"] = data["advantages"] + data["values"]
+
+    mean_value_loss = 0
+    counter = 0
+    max_gradient_steps = 24
+    max_grad_norm = 1.0
+    batch_size = 2**16
+    generator = create_uniform_generator(
+        data,
+        batch_size,
+        max_gradient_steps=max_gradient_steps,
+    )
+    for batch in generator:
+        value_loss = test_critic.loss_fn(batch["critic_obs"], batch["returns"])
+        critic_optimizer.zero_grad()
+        value_loss.backward()
+        nn.utils.clip_grad_norm_(test_critic.parameters(), max_grad_norm)
+        critic_optimizer.step()
+        mean_value_loss += value_loss.item()
+        print("Value loss: ", value_loss.item())
+        counter += 1
+    mean_value_loss /= counter
+
+    # compare new and old critics
+    save_path = os.path.join(
+        LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str
+    )
+    if not os.path.exists(save_path):
+        os.makedirs(save_path)
+
+    plot_pendulum_single_critic(
+        x=data["critic_obs"][0],
+        predictions=data["values"][0],
+        targets=data["returns"][0],
+        title=f"{critic_name}_iteration{iteration}",
+        fn=save_path + f"/{critic_name}_it{iteration}",
+    )
diff --git a/scripts/play.py b/scripts/play.py
index 29a837ea..1a23c22e 100644
--- a/scripts/play.py
+++ b/scripts/play.py
@@ -16,9 +16,9 @@ def setup(args):
     env_cfg.commands.resampling_time = 9999
     env_cfg.env.episode_length_s = 9999
     env_cfg.env.num_projectiles = 20
+    env_cfg.init_state.reset_mode = "reset_to_range"
     task_registry.make_gym_and_sim()
     env = task_registry.make_env(args.task, env_cfg)
-    env.cfg.init_state.reset_mode = "reset_to_basic"
     train_cfg.runner.resume = True
     train_cfg.logging.enable_local_saving = False
     runner = task_registry.make_alg_runner(env, train_cfg)
@@ -46,9 +46,9 @@ def play(env, runner, train_cfg):
         if env.cfg.viewer.record:
             recorder.update(i)
         runner.set_actions(
-            runner.policy_cfg["actions"],
+            runner.actor_cfg["actions"],
             runner.get_inference_actions(),
-            runner.policy_cfg["disable_actions"],
+            runner.actor_cfg["disable_actions"],
         )
         env.step()
         env.check_exit()
diff --git a/scripts/train.py b/scripts/train.py
index 283717e8..176fffb3 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -1,5 +1,5 @@
 from gym.envs import __init__  # noqa: F401
-from gym.utils import get_args, task_registry, randomize_episode_counters
+from gym.utils import get_args, task_registry
 from gym.utils.logging_and_saving import wandb_singleton
 from gym.utils.logging_and_saving import local_code_save_helper
 
@@ -12,9 +12,8 @@ def setup():
     task_registry.make_gym_and_sim()
     wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args)
     env = task_registry.make_env(name=args.task, env_cfg=env_cfg)
-    randomize_episode_counters(env)
-    randomize_episode_counters(env)
 
+    # randomize_episode_counters(env)
     policy_runner = task_registry.make_alg_runner(env, train_cfg)
     local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir)
diff --git a/setup.py b/setup.py
index c918d911..06709c4e 100644
--- a/setup.py
+++ b/setup.py
@@ -2,20 +2,16 @@
 from distutils.core import setup
 
 setup(
-    name="gpuGym",
-    version="1.0.1",
+    name="QGym",
+    version="1.0.3",
     author="Biomimetic Robotics Lab",
     license="BSD-3-Clause",
     packages=find_packages(),
-    description="Isaac Gym environments for Legged Robots",
+    description="Isaac Gym environments",
     install_requires=[
         "isaacgym",
-        "matplotlib",
-        "pandas",
-        "tensorboard",
         "setuptools==59.5.0",
         "torch>=1.4.0",
         "torchvision>=0.5.0",
-        "numpy>=1.16.4",
     ],
 )
diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py
index 4781a665..f2e607c6 100644
--- a/tests/integration_tests/test_runner_integration.py
+++ b/tests/integration_tests/test_runner_integration.py
@@ -7,7 +7,7 @@
 # * imports for onnx test
 import onnx
 import onnxruntime
-from learning.modules import ActorCritic
+from learning.modules import Actor
 from gym.utils.helpers import get_load_path
 from gym import LEGGED_GYM_ROOT_DIR
 
@@ -27,21 +27,18 @@ def learn_policy(args):
 
 
 def load_saved_policy(runner):
-    num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"])
-    num_critic_obs = runner.get_obs_size(runner.policy_cfg["critic_obs"])
-    num_actions = runner.get_action_size(runner.policy_cfg["actions"])
-    actor_critic = ActorCritic(
-        num_actor_obs, num_critic_obs, num_actions, **runner.policy_cfg
-    ).to(runner.device)
+    num_actor_obs = runner.get_obs_size(runner.actor_cfg["obs"])
+    num_actions = runner.get_action_size(runner.actor_cfg["actions"])
+    actor = Actor(num_actor_obs, num_actions, **runner.actor_cfg).to(runner.device)
     resume_path = get_load_path(
         name=runner.cfg["experiment_name"],
         load_run=runner.cfg["load_run"],
         checkpoint=runner.cfg["checkpoint"],
     )
     loaded_dict = torch.load(resume_path)
-    actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"])
-    actor_critic.eval()
-    return actor_critic
+    actor.load_state_dict(loaded_dict["actor_state_dict"])
+    actor.eval()
+    return actor
 
 
 class TestDefaultIntegration:
@@ -60,7 +57,7 @@ def test_default_integration_settings(self, args):
 
         with torch.no_grad():
             actions = runner.get_inference_actions()
-            deployed_actions = runner.env.get_states(runner.policy_cfg["actions"])
+            deployed_actions = runner.env.get_states(runner.actor_cfg["actions"])
         assert (
             torch.equal(actions, torch.zeros_like(actions)) is False
         ), "Policy returning all zeros"
@@ -95,20 +92,20 @@ def test_default_integration_settings(self, args):
             f"{model_5_path}" "(last iteration) was not saved"
        )
 
-        obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"]))
-        actions_first = runner.alg.actor_critic.act_inference(obs).cpu().clone()
+        obs = torch.randn_like(runner.get_obs(runner.actor_cfg["obs"]))
+        actions_first = runner.alg.actor.act_inference(obs).cpu().clone()
         runner.load(model_8_path)
-        actions_loaded = runner.alg.actor_critic.act_inference(obs).cpu().clone()
+        actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone()
         assert torch.equal(actions_first, actions_loaded), "Model loading failed"
 
         # * test onnx exporting
-        loaded_actor_critic = load_saved_policy(runner)
+        loaded_actor = load_saved_policy(runner)
         export_path = os.path.join(
             LEGGED_GYM_ROOT_DIR, "tests", "integration_tests", "exported_policy"
         )
-        loaded_actor_critic.export_policy(export_path)
+        loaded_actor.export(export_path)
 
         # load the exported onnx
         path_to_onnx = os.path.join(export_path, "policy.onnx")
@@ -121,9 +118,9 @@ def test_default_integration_settings(self, args):
         )
         # compute torch output
         with torch.no_grad():
-            test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1]
-            runner_out = runner.alg.actor_critic.actor.act_inference(test_input)
-            loaded_out = loaded_actor_critic.actor.act_inference(test_input)
+            test_input = runner.get_obs(runner.actor_cfg["obs"])[0:1]
+            runner_out = runner.alg.actor.act_inference(test_input)
+            loaded_out = loaded_actor.act_inference(test_input)
 
         # compute ONNX Runtime output prediction
         ort_inputs = {ort_session.get_inputs()[0].name: test_input.cpu().numpy()}
@@ -131,8 +128,8 @@
         # compare ONNX Runtime and PyTorch results
         torch.testing.assert_close(
-        runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-8
+        runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-7
         )
 
         torch.testing.assert_close(
-            runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-8
+            runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-7
         )
diff --git a/tests/regression_tests/run_n_iterations.py b/tests/regression_tests/run_n_iterations.py
index 6a3a4965..29e2f424 100644
--- a/tests/regression_tests/run_n_iterations.py
+++ b/tests/regression_tests/run_n_iterations.py
@@ -33,7 +33,7 @@ def _run_training_test(output_tensor_file, env, runner):
     runner.learn()
 
     # * get the test values after learning
-    actions = runner.env.get_states(runner.policy_cfg["actions"])
+    actions = runner.env.get_states(runner.actor_cfg["actions"])
 
     # * return the values to the parent for assertion
     actions.detach().cpu()
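
Note on the return computation exercised by the new tests above: the expected tables pin down a specific bootstrapping contract for compute_MC_returns. Steps flagged as timed_out (and the end of the rollout buffer) are bootstrapped with the critic's value estimate, true terminations are not bootstrapped, and a step that is both terminated and timed out is treated as terminated. The sketch below only illustrates that contract as inferred from the tests, not the repository's implementation; the name compute_MC_returns_sketch is hypothetical, and it assumes critic.evaluate() returns values already squeezed to the (timesteps, envs) shape of data["rewards"].

import torch


def compute_MC_returns_sketch(data, gamma, critic):
    # data is a TensorDict with batch_size (T, N), as built in the test fixture
    rewards = data["rewards"]        # (T, N)
    terminated = data["terminated"]  # (T, N) bool: true terminal states, no bootstrap
    timed_out = data["timed_out"]    # (T, N) bool: truncations, bootstrap with the critic
    next_values = critic.evaluate(data["next_critic_obs"])  # assumed to be (T, N)

    returns = torch.zeros_like(rewards)
    # the buffer end behaves like a truncation for every env that did not terminate there
    next_return = next_values[-1]
    for t in reversed(range(rewards.shape[0])):
        # default: continue the discounted sum started at step t + 1
        ret = rewards[t] + gamma * next_return
        # truncation: cut the sum here and bootstrap with the critic instead
        ret = torch.where(timed_out[t], rewards[t] + gamma * next_values[t], ret)
        # termination: no future value at all; termination wins over timeout
        ret = torch.where(terminated[t], rewards[t], ret)
        returns[t] = ret
        next_return = ret
    return returns

With gamma = 0 this collapses to the raw rewards, and with a constant critic it reproduces the hand-computed tables in the tests; the final test then checks that GAE with lam = 1.0 reduces to these returns minus the critic's value estimates.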