diff --git a/high-level-code.txt b/high-level-code.txt new file mode 100644 index 0000000..1997940 --- /dev/null +++ b/high-level-code.txt @@ -0,0 +1,48 @@ +Visual Whole-Body Control for Legged Loco-Manipulation + +├── b1-pick-multi-teacher == policy +├── data == robot model; object_model(.obj,.npy点云特征数据,.urdf.visual.py);cfg配置 +├── envs == b1z1_pickmulti.py +├── experiments +├── learning +├── modules +├── play_multi_bc_deter.py +├── play_multistate.py +├── __pycache__ +├── README.md +├── test_pointcloud.py +├── train_multi_bc_deter.py +├── train_multistate_asym.py +├── train_multistate.py +├── utils +└── wandb + +Reward +_reward_base_height:它测量机器人根部的当前高度跟一个给定目标高度的差异,然后根据这个差异计算一个奖励,距离目标越近奖励越大, 且奖励以指数的形式衰减。 + +_reward_approaching:比之前更靠近方块了,给奖励,奖励数值根据你靠近的幅度变化,但不会无限增大。如果你距离太远了,就不给奖励。 +_reward_lifting: 如果成功把物体举得比之前更高,给奖励,奖励根据提升的高度大小变化,但不会无限大。一旦物体已经举起来了,就不再发奖励。 +_reward_pick_up: 该 _reward_pick_up 函数确保机器人获得明确的成功奖励,且通过“持物计数器”机制保证所称的“成功拾取”是稳定的抓取行为,从而引导机器人学会稳健的拾取物体执行策略,最终实现论文中较高的抓取成功率和灵活多次尝试能力 +_reward_acc_penalty: 衡量机械臂运动中的速度变化率(即加速度),并对高加速度动作进行惩罚,促进机械臂动作平滑和稳定性。 + +_reward_command_reward: 只对机器人距离物体较近(小于0.6米)的位置赋予奖励。奖励数值是对机器人执行的第一个指令分量(常是线速度或位移等命令)的绝对值。 + +_reward_command_penalty: 值取负,再做指数函数,意味着命令越小(越接近0),奖励越大,鼓励机器人在接近物体时保持命令输入平稳或较小。 +该函数用于惩罚机器人在接近物体时施加过大的动作命令,促使动作更加平滑和节制。 + +_reward_action_rate: 动作变化越大,奖励越大 动作中线速度和偏航角速度变化越大,奖励越大 + +_reward_ee_orn: 算机器人“手”的朝向和目标物体之间的对齐程度,奖励机器人“手臂的方向越对着目标,奖励越高” + +_reward_base_dir: 该函数根据机器人基座的朝向和目标物体相对位置方向,计算两者的方向一致度并返回对应奖励。奖励越高表示机器人越“正对”目标物体,这对于基座调整朝向、朝向目标物体抓取等任务极为关键。 + +_reward_rad_penalty:这个函数对应机器人的奖励设计,目的是通过对距离的软限制引导机器人让末端执行器位置保持在理想的半径周围,避免过近或过远,保证机械臂动作既有效又安全。 + +_reward_base_ang_pen: 这个函数计算一个与机器人的“基座角速度”相关的奖励,用于惩罚基座的旋转速度,鼓励机器人保持稳定。 + +_reward_base_approaching: 它是计算机器人“身体底座”(base)与目标物体在平面上的距离接近度,并基于距离的接近程度给出奖励。距离越接近一个预设目标距离,奖励越高。 + +_reward_gripper_rate: 该函数衡量并奖励夹爪动作的变化速度,以促使机器人在操作过程中灵活调整夹爪开合,提升操控精度和抓取成功率。同时,训练初期奖励关闭,保证训练稳定,防止夹爪动作变化过早产生不必要的影响。 + +_reward_standpick: 
机器人如果靠近目标物体(距离够近),并且它的移动速度不是太快(在设定限制内),就奖励1分。靠近目标物体(增加成功概率),保持合理的移动速度(稳健动作)且训练初期不急于给奖励,让机器人先积累经验。 +_reward_grasp_base_height: 用于奖励机器人抬起物体的能力。它依赖于一个更基础的奖励 _reward_base_height,可能衡量抬起来的高度。 diff --git a/high-level/README.md b/high-level/README.md index 011d09b..3f8fac7 100644 --- a/high-level/README.md +++ b/high-level/README.md @@ -3,7 +3,10 @@ This code base only includes the task of picking multiple objects. ## Code structure + `data` contains assets and config files. +assets: robot model +config: RL Environment `envs` contains environment related codes. @@ -15,24 +18,25 @@ This code base only includes the task of picking multiple objects. ## Train -1. Environments: +1. Environments: **Picking multiple objects**: [b1_z1pickmulti.py](./envs/b1_z1pickmulti.py) is for walking and picking policy, supporting floating base (fixed and varied body heights). - 2. To train the state-based teacher, using [train_multistate.py](./train_multistate.py) with the config [b1z1_pickmulti.yaml](./data/cfg/b1z1_pickmulti.yaml) (remember to determine the pre-trained low-level policy path in the config file): + ```bash python train_multistate.py --rl_device "cuda:0" --sim_device "cuda:0" --timesteps 60000 --headless --task B1Z1PickMulti --experiment_dir b1-pick-multi-teacher --wandb --wandb_project "b1-pick-multi-teacher" --wandb_name "some descriptions" --roboinfo --observe_gait_commands --small_value_set_zero --rand_control --stop_pick ``` + Arguments explanation: -`--task` should be the full name of the environment, with every first letter of each word capitalized. +`--task` should be the full name of the environment, with every first letter of each word capitalized. -`--timesteps` total training steps. +`--timesteps` total training steps. `--experiment_dir` is the name of the directory where the running is saved. -`--wandb`, then the training will be logged to wandb. You can omit this argument if you don't want to use wandb or when debugging. 
+`--wandb`, then the training will be logged to wandb. You can omit this argument if you don't want to use wandb or when debugging. -`--wandb_project` is the name of the project in wandb. +`--wandb_project` is the name of the project in wandb. `--wandb_name` is the name of the run in wandb, which is also the name of this run in experiment_dir. @@ -47,26 +51,38 @@ Arguments explanation: `--stop_pick` enforces the robot to stop when the gripper is closing. For playing the teacher policy, using [play_multistate.py](./play_multistate.py): + ```bash python play_multistate.py --task B1Z1PickMulti --checkpoint "(specify the path)" # --(same arguments as training) ``` -It should be a maximum of 60000 timesteps for successful teacher policy training. +It should be a maximum of 60000 timesteps for successful teacher policy training. 3. To train the vision-based student policy, use [train_multi_bc_deter.py](./train_multi_bc_deter.py) with the config [b1z1_pickmulti.yaml](./data/cfg/b1z1_pickmulti.yaml) + ```bash python train_multi_bc_deter.py --headless --task B1Z1PickMulti --rl_device "cuda:0" --sim_device "cuda:0" --timesteps 60000 --experiment_dir "b1-pick-multi-stu" --wandb --wandb_project "b1-pick-multi-stu" --wandb_name "checkpoint dir path" --teacher_ckpt_path "teacher checkpoint path" --roboinfo --observe_gait_commands --small_value_set_zero --rand_control --stop_pick ``` + Arguments are similar to those above. For playing the trained student policy, using [play_multi_bc_deter.py](./play_multi_bc_deter.py): + ```bash python play_multi_bc_deter.py --task B1Z1PickMulti --checkpoint "(specify the path)" # --(same arguments as training) ``` + If you don't specify `--num_envs`, it will use 34 by default (only for this script). It should be a maximum of 60000 timesteps for successful student policy training. ## Others + [test_pointcloud.py](./test_pointcloud.py) can be use for checking the pointcloud of the objects. 
[train_multistate_asym.py](./train_multistate_asym.py) is a try of using asymetric PPO for training the high-level policy (i.e, vision-based policy and privilaged value function), it is training inefficient and is not comparable to the teacher-student as it cannot parallel too many environments due to the depth images consumption. + +command: + +python train_multistate.py --timesteps 60000 --headless --task B1Z1PickMulti --experiment_dir b1-pick-multi-teacher --wandb --wandb_project "b1-pick-multi-teacher" --wandb_name "policy2" --roboinfo --observe_gait_commands --small_value_set_zero --rand_control --stop_pick + +python play_multistate.py --task B1Z1PickMulti --checkpoint /home/wang/Desktop/visual_wholebody/high-level/b1-pick-multi-teacher/policy_2/checkpoints/agent_30001.pt --roboinfo --observe_gait_commands --small_value_set_zero --rand_control --stop_pick diff --git a/high-level/data/cfg/b1z1_pickmulti.yaml b/high-level/data/cfg/b1z1_pickmulti.yaml index 8f609fd..fef3123 100644 --- a/high-level/data/cfg/b1z1_pickmulti.yaml +++ b/high-level/data/cfg/b1z1_pickmulti.yaml @@ -1,10 +1,10 @@ env: - numEnvs: 10240 + numEnvs: 2048 numAgents: 1 envSpacing: 5.0 enableDebugVis: False isFlagrun: False - maxEpisodeLength: 150 #200 + maxEpisodeLength: 200 #200 pdControl: True powerScale: 1.0 controlFrequencyInv: 4 @@ -17,8 +17,9 @@ env: holdSteps: 25 lastCommands: False - low_policy_path: "/data/mhliu/visual_wholebody/high-level/data/low_policy/publiccheckrollrew_42000.pt" # "data/low_policy/jan17-frontfeet_terrainflat_linrew2_freq2_gaitstopzero_penelizeang_roll_fixbug_35600.pt" - + low_policy_path: "/home/wang/Desktop/visual_wholebody/low-level/logs/b1z1-low/gait_reference/model_37000.pt" + # "data/low_policy/jan17-frontfeet_terrainflat_linrew2_freq2_gaitstopzero_penelizeang_roll_fixbug_35600.pt" + # "/home/wang/Desktop/visual_wholebody/low-level/logs/b1z1-low/gait_tuning_v2/model_13000.pt" asset: assetRoot: "data/asset" assetFileRobot: "b1z1-col/urdf/b1z1.urdf" @@ -216,7 
+217,7 @@ sim: solver_type: 1 num_position_iterations: 8 num_velocity_iterations: 0 - contact_offset: 0.02 + contact_offset: 0.08 rest_offset: 0.0 bounce_threshold_velocity: 0.2 max_depenetration_velocity: 50.0 @@ -227,21 +228,21 @@ reward: base_height_target: 0.55 only_positive_rewards: False scales: - approaching: 0.5 + approaching: 5.0 lifting: 1.0 - pick_up: 3.5 # 0.5 + pick_up: 5.0 # 0.5 acc_penalty: -0.001 command_penalty: -1.0 command_reward: 0.25 - standpick: 0.25 + standpick: 0.25 # not found in reward_vec_task.py # action_rate: -0.001 - ee_orn: 0.01 + ee_orn: 0.1 # 0.01 base_dir: 0.25 rad_penalty: 0.0 base_ang_pen: 0.0 base_approaching: 0.01 # 0.05 - grasp_base_height: 0.5 - gripper_rate: 0.0 # -0.1 + grasp_base_height: 0.5 # not found in reward_vec_task.py + gripper_rate: -0.1 # -0.1 sensor: depth_clip_lower: 0.15 diff --git a/high-level/envs/b1z1_pickmulti.py b/high-level/envs/b1z1_pickmulti.py index ab8b047..c69fd47 100644 --- a/high-level/envs/b1z1_pickmulti.py +++ b/high-level/envs/b1z1_pickmulti.py @@ -122,7 +122,7 @@ def _create_envs(self): self.table_heights = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) # table - self.table_dimz = 0.25 + self.table_dimz = 0.45 self.table_dims = gymapi.Vec3(0.6, 1.0, self.table_dimz) table_options = gymapi.AssetOptions() table_options.fix_base_link = True @@ -257,7 +257,7 @@ def _reset_table(self, env_ids): else: rand_heights = torch.ones((len(env_ids), 1), device=self.device, dtype=torch.float)*self.table_heights_fix - self.table_dimz / 2 - self._table_root_states[env_ids, 2] = rand_heights.squeeze(1) - self.table_dimz / 2.0 + self._table_root_states[env_ids, 2] = self.table_dimz / 2.0 self.table_heights[env_ids] = self._table_root_states[env_ids, 2] + self.table_dimz / 2.0 def _reset_actors(self, env_ids): diff --git a/high-level/train_multistate.py b/high-level/train_multistate.py index 4061c92..93fa4e7 100644 --- a/high-level/train_multistate.py +++ b/high-level/train_multistate.py @@ -21,7 
+21,7 @@ from utils.config import load_cfg, get_params, copy_cfg import utils.wrapper as wrapper -set_seed(43) +set_seed(101) def create_env(cfg, args): cfg["env"]["enableDebugVis"] = args.debugvis @@ -36,6 +36,9 @@ def create_env(cfg, args): robot_start_pose = (-2.00, 0, 0.55) if args.eval: robot_start_pose = (-0.85, 0, 0.55) + + # python train_multistate.py --rl_device "cuda:0" --sim_device "cuda:0" --timesteps 60000 --headless --task B1Z1PickMulti --experiment_dir b1-pick-multi-teacher --wandb --wandb_project "b1-pick-multi-teacher" --wandb_name "some descriptions" --roboinfo --observe_gait_commands --small_value_set_zero --rand_control --stop_pick + _env = eval(args.task)(cfg=cfg, rl_device=args.rl_device, sim_device=args.sim_device, graphics_device_id=args.graphics_device_id, headless=args.headless, use_roboinfo=args.roboinfo, observe_gait_commands=args.observe_gait_commands, no_feature=args.no_feature, mask_arm=args.mask_arm, pitch_control=args.pitch_control, @@ -110,8 +113,61 @@ def get_trainer(is_eval=False): args = get_params() args.eval = is_eval args.wandb = args.wandb and (not args.eval) and (not args.debug) - cfg_file = "b1z1_" + args.task[4:].lower() + ".yaml" + cfg_file = "b1z1_" + args.task[4:].lower() + ".yaml" # B1Z1PickMulti -> b1z1_pick_multi.yaml file_path = "data/cfg/" + cfg_file + + print("Arguments passed to get_trainer:") + + # Arguments passed to get_trainer: + # task: B1Z1PickMulti + # timesteps: 60000 + # control_freq: None + # rl_device: cuda:0 + # sim_device: cuda:0 + # graphics_device_id: -1 + # headless: True + # wandb: True + # wandb_project: b1-pick-multi-teacher + # wandb_name: some descriptions + # checkpoint: + # experiment_dir: b1-pick-multi-teacher + # debugvis: False + # save_image: False + # debug: False + # wrist_seg: False + # front_only: False + # seperate: False + # teacher_ckpt_path: + # resume: False + # roboinfo: True + # observe_gait_commands: True + # small_value_set_zero: True + # fixed_base: False + # use_tanh: 
False + # reach_only: False + # record_video: False + # last_commands: False + # no_feature: False + # mask_arm: False + # mlp_stu: False + # depth_random: False + # pitch_control: False + # pred_success: False + # near_goal_stop: False + # obj_move_prob: 0.0 + # rand_control: True + # arm_delay: False + # rand_cmd_scale: False + # rand_depth_clip: False + # stop_pick: True + # arm_kp: 40 + # arm_kd: 2 + # table_height: None + # seed: 43 + # eval: False + + for key, value in vars(args).items(): + print(f"{key}: {value}") if args.resume: experiment_dir = os.path.join(args.experiment_dir, args.wandb_name) @@ -130,7 +186,7 @@ def get_trainer(is_eval=False): file_path = os.path.join(experiment_dir, cfg_file) print("Find the latest checkpoint: ", args.checkpoint) - print("Using config file: ", file_path) + print("Using config file: ", file_path) # data/cfg/b1z1_pickmulti.yaml cfg = load_cfg(file_path) cfg['env']['wandb'] = args.wandb @@ -226,3 +282,4 @@ def get_trainer(is_eval=False): trainer = get_trainer() trainer.train() + diff --git a/low-level/README.md b/low-level/README.md index 9eff31d..c3a5dd8 100644 --- a/low-level/README.md +++ b/low-level/README.md @@ -1,6 +1,7 @@ # Training a universal low-level policy ## Code structure + `legged_gym/envs` contains environment-related codes. `legged_gym/scripts` contains train and test scripts. @@ -13,6 +14,7 @@ The environment related code is `legged_gym/legged_gym/envs/manip_loco/manip_loc cd legged_gym/scripts python train.py --headless --exptid SOME_YOUR_DESCRIPTION --proj_name b1z1-low --task b1z1 --sim_device cuda:0 --rl_device cuda:0 --observe_gait_commands ``` + - `--debug` disables wandb and set a small number of envs for faster execution. - `--headless` disables rendering, typically used when you train model. - `--proj_name` the folder containing all your logs and wandb project name. `manip-loco` is default. 
@@ -21,12 +23,24 @@ python train.py --headless --exptid SOME_YOUR_DESCRIPTION --proj_name b1z1-low - Check `legged_gym/legged_gym/utils/helpers.py` for all command line args. ## Play + Only need to specify `--exptid`. The parser will automatically find corresponding runs. + ```bash cd legged_gym/scripts python play.py --exptid SOME_YOUR_DESCRIPTION --task b1z1 --proj_name b1z1-low --checkpoint 64000 --observe_gait_commands ``` + Use `--sim_device cpu --rl_device cpu` in case not enough GPU memory. ## Suggestions + To choose a good low-level policy that can be further used for training the high-level policy, we suggest you deploy the low-level policy first, and see if it goes well before training a high-level policy. + +command line + +python play_multistate.py --task B1Z1PickMulti --checkpoint /home/wang/Desktop/visual_wholebody/high-level/b1-pick-multi-teacher/policy_2/checkpoints/agent_30001.pt --roboinfo --observe_gait_commands --small_value_set_zero --rand_control --stop_pick + + +python train_multistate.py --timesteps 60000 --headless --task B1Z1PickMulti --experiment_dir b1-pick-multi-teacher --w +andb --wandb_project "b1-pick-multi-teacher" --wandb_name "policy_4" --roboinfo --observe_gait_commands --small_value_set_zero --rand_control --stop_pick diff --git a/low-level/legged_gym/envs/base/legged_robot.py b/low-level/legged_gym/envs/base/legged_robot.py index 6cf95c4..18a3351 100644 --- a/low-level/legged_gym/envs/base/legged_robot.py +++ b/low-level/legged_gym/envs/base/legged_robot.py @@ -783,7 +783,7 @@ def _get_env_origins(self): self.env_origins[:, 1] = spacing * yy.flatten()[:self.num_envs] self.env_origins[:, 2] = 0. 
- def _parse_cfg(self, cfg): + def _parse_cfg(self, cfg): # load legged_robot_config.py self.dt = self.cfg.control.decimation * self.sim_params.dt self.obs_scales = self.cfg.normalization.obs_scales self.reward_scales = class_to_dict(self.cfg.rewards.scales) diff --git a/low-level/legged_gym/scripts/train.py b/low-level/legged_gym/scripts/train.py index 36a7632..2ec51ab 100644 --- a/low-level/legged_gym/scripts/train.py +++ b/low-level/legged_gym/scripts/train.py @@ -49,7 +49,7 @@ def train(args): mode = "disabled" args.rows = 6 args.cols = 2 - args.num_envs = 128 + args.num_envs = 1 else: mode = "online" wandb.init(project=args.proj_name, name=args.exptid, mode=mode, dir=LEGGED_GYM_ENVS_DIR +"/logs") diff --git a/low-level/legged_gym/utils/helpers.py b/low-level/legged_gym/utils/helpers.py index 61e335f..6285c8a 100644 --- a/low-level/legged_gym/utils/helpers.py +++ b/low-level/legged_gym/utils/helpers.py @@ -30,11 +30,11 @@ import os import copy -import torch import numpy as np import random from isaacgym import gymapi from isaacgym import gymutil +import torch from legged_gym import LEGGED_GYM_ROOT_DIR, LEGGED_GYM_ENVS_DIR diff --git a/low-level/record.txt b/low-level/record.txt new file mode 100644 index 0000000..34f696f --- /dev/null +++ b/low-level/record.txt @@ -0,0 +1,20 @@ +Observation (obs): + Joint positions, velocities + Contact sensors or gait phase info + Command targets (if --observe_gait_commands) + Possibly noise from _get_noise_scale_vec() + + + +Reward: +Tracks performance: velocity tracking, foot clearance, energy cost +Done condition: falls, too far from goal, etc. + +Command: +python scripts/train.py --headless --exptid gait_tunning_v2 --proj_name b1z1-low --task b1z1 --sim_device cuda:0 --rl_device cuda:0 --observe_gait_commands --debug + + +python play.py --exptid gait_tunning_v2 --task b1z1 --proj_name b1z1-low --checkpoint 13000 --observe_gait_commands + + +