diff --git a/.gitignore b/.gitignore
index e144cbd..0085af3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,8 +58,9 @@ temp/
 data/AMASS/Transitions/
 data/g1_29dof/
 data/smpl/
-
+data/Transitions
 # Pickle files
 # *.pkl
 
-homework.md
\ No newline at end of file
+homework.md
+
diff --git a/README.md b/README.md
index 72e1e02..8def89f 100644
--- a/README.md
+++ b/README.md
@@ -41,9 +41,18 @@ Then we need to fit the motion data to the humanoid robot:
 ```bash
 uv run scripts/3-fit_smpl_motion_mp.py +motion_path=./data/AMASS +target_fps=30 +output_name=<motion_file>
 ```
+fit motion with quat
+```bash
+uv run scripts/3-fit_smpl_motion_withquat.py +motion_path=./data/AMASS +target_fps=30 +output_name=<motion_file>
+```
+
 It will save the fitted motion data in the `data/g1_29dof/<output_name>.pkl` file.
 
 Finally, we can visualize the fitted motion data:
 ```bash
-uv run scripts/3-vis_q_mj.py +motion_file=./data/g1_29dof/<output_name>.pkl
+uv run scripts/4-vis_mj.py +motion_file=./data/g1_29dof/<output_name>.pkl
+```
+find orit offset
+```bash
+uv run 5-findoffset.py
 ```
\ No newline at end of file
diff --git a/cfg/unitree_g1_fitting copy.yaml b/cfg/unitree_g1_fitting copy.yaml
new file mode 100644
index 0000000..d7de046
--- /dev/null
+++ b/cfg/unitree_g1_fitting copy.yaml	
@@ -0,0 +1,48 @@
+humanoid_type: g1_29dof
+
+asset:
+  assetFileName: "assets/robot/unitree_g1/g1_29dof_rev_1_0.xml"
+
+
+extend_config:
+  - joint_name: "head_link"
+    parent_name: "pelvis"
+    pos: [0.0, 0.0, 0.4]
+    rot: [1.0, 0.0, 0.0, 0.0]
+
+  - joint_name: "left_toe_link"
+    parent_name: "left_ankle_roll_link"
+    pos: [0.08, 0.0, 0.0]
+    rot: [1.0, 0.0, 0.0, 0.0]
+
+  - joint_name: "right_toe_link"
+    parent_name: "right_ankle_roll_link"
+    pos: [0.08, 0.0, 0.0]
+    rot: [1.0, 0.0, 0.0, 0.0]
+
+base_link: "torso_link"
+joint_matches:
+  - ["pelvis", "Pelvis"]
+  - ["left_hip_pitch_link", "L_Hip"]
+  - ["left_knee_link", "L_Knee"]
+  - ["left_ankle_roll_link", "L_Ankle"]
+  - ["right_hip_pitch_link", "R_Hip"]
+  - ["right_knee_link", "R_Knee"]
+  - ["right_ankle_roll_link", "R_Ankle"]
+  - ["left_shoulder_roll_link", "L_Shoulder"]
+  - ["left_elbow_link", "L_Elbow"]
+  - ["left_wrist_yaw_link", "L_Hand"]
+  - ["right_shoulder_roll_link", "R_Shoulder"]
+  - ["right_elbow_link", "R_Elbow"]
+  - ["right_wrist_yaw_link", "R_Hand"]
+  - ["head_link", "Head"]
+  - ["left_toe_link", "L_Toe"]
+  - ["right_toe_link", "R_Toe"]
+
+
+smpl_pose_modifier:
+  Pelvis: "[np.pi/2, 0, np.pi/2]"
+  L_Shoulder: "[0, 0, -np.pi/2]"
+  R_Shoulder: "[0, 0, np.pi/2]"
+  L_Elbow: "[0, -np.pi/2, 0]"
+  R_Elbow: "[0, np.pi/2, 0]"
\ No newline at end of file
diff --git a/cfg/unitree_g1_fitting.yaml b/cfg/unitree_g1_fitting.yaml
index d7de046..099eb05 100644
--- a/cfg/unitree_g1_fitting.yaml
+++ b/cfg/unitree_g1_fitting.yaml
@@ -45,4 +45,42 @@ smpl_pose_modifier:
   L_Shoulder: "[0, 0, -np.pi/2]"
   R_Shoulder: "[0, 0, np.pi/2]"
   L_Elbow: "[0, -np.pi/2, 0]"
-  R_Elbow: "[0, np.pi/2, 0]"
\ No newline at end of file
+  R_Elbow: "[0, np.pi/2, 0]"
+
+
+orit_offset:
+  Pelvis: "[np.pi/2, 0, np.pi/2]"
+  L_Hip: "[np.pi/2, 0, np.pi/2]"
+  R_Hip: "[np.pi/2, 0, np.pi/2]"
+  L_Knee: "[np.pi/2, 0, np.pi/2]"
+  R_Knee: "[np.pi/2, 0, np.pi/2]"
+  L_Ankle: "[np.pi/2, 0, np.pi/2]"
+  R_Ankle: "[np.pi/2, 0, np.pi/2]"
+  L_Toe: "[np.pi/2, 0, np.pi/2]"
+  R_Toe: "[np.pi/2, 0, np.pi/2]"
+  Head: "[np.pi/2, 0, np.pi/2]"
+  L_Shoulder: "[0, np.pi/2, 0]"
+  R_Shoulder: "[-np.pi, -np.pi/2, 0]"
+  L_Elbow: "[np.pi * 0.25, 0, 0]"
+  R_Elbow: "[0 , 0, -np.pi]"
+  L_Hand: "[0, 0, 0]"
+  R_Hand: "[0.0, 0, -np.pi ]"
+
+quat_offset:
+  Pelvis: "[0.5, -0.5, -0.5, -0.5]"
+  L_Hip: "[0.4267755048530407,-0.5637931078484661,-0.5637931078484661,-0.4267755048530407]"
+  L_Knee: "[0.5, -0.5, -0.5, -0.5]"
+  L_Ankle: "[0.5, -0.5, -0.5, -0.5]"
+  R_Hip: "[0.4267755048530407,-0.5637931078484661,-0.5637931078484661,-0.4267755048530407]"
+  R_Knee: "[0.5, -0.5, -0.5, -0.5]"
+  R_Ankle: "[0.5, -0.5, -0.5, -0.5]"
+  L_Toe: "[0.5, -0.5, -0.5, -0.5]"
+  R_Toe: "[0.5, -0.5, -0.5, -0.5]"
+  Head: "[0.5, -0.5, -0.5, -0.5]"
+  L_Shoulder: "[0.7071, 0, -0.7071, 0]"
+  #L_Elbow: "[0.924, -0.383, 0, 0]"
+  L_Elbow: "[1, 0, 0, 0]"
+  L_Hand: "[1, 0, 0, 0]"
+  R_Shoulder: "[0, 0.7071, 0, 0.7071]"
+  R_Elbow: "[0, 0, 0, -1]"
+  R_Hand: "[0, 0, 0, -1]"
\ No newline at end of file
diff --git a/data/AMASS/SFU/0005/0005_Walking001_poses.npz b/data/AMASS/SFU/0005/0005_Walking001_poses.npz
deleted file mode 100644
index 8044d08..0000000
Binary files a/data/AMASS/SFU/0005/0005_Walking001_poses.npz and /dev/null differ
diff --git a/data/AMASS/SFU/0005/twistdance_kick_stageii.npz b/data/AMASS/SFU/0005/twistdance_kick_stageii.npz
new file mode 100644
index 0000000..d1fff63
Binary files /dev/null and b/data/AMASS/SFU/0005/twistdance_kick_stageii.npz differ
diff --git a/data/Transitions/LICENSE.txt b/data/Transitions/LICENSE.txt
new file mode 100644
index 0000000..d2332d9
--- /dev/null
+++ b/data/Transitions/LICENSE.txt
@@ -0,0 +1,62 @@
+License
+Dataset Copyright License for non-commercial scientific research purposes
+
+Please read carefully the following terms and conditions and any accompanying documentation before you download and/or use the AMASS Dataset and the accompanying materials (jointly referred to as the "Dataset"). By downloading and/or using the Dataset, you acknowledge that you have read these terms and conditions, understand them, and agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use the Dataset. Any infringement of the terms of this agreement will automatically terminate your rights under this License.
+
+Ownership / Licensees
+The Dataset and the associated materials has been developed at the 
+
+Max Planck Institute for Intelligent Systems (hereinafter "MPI") and the Max Planck Institute for Biological Cybernetics (hereinafter “KYB”).
+
+Any copyright or patent right is owned by and proprietary material of the
+
+Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (hereinafter “MPG”; MPI and MPG hereinafter collectively “Max-Planck”)
+
+hereinafter the “Licensor”.
+
+License Grant
+Licensor grants you (Licensee) personally a single-user, non-exclusive, non-transferable, free of charge right:
+
+To obtain and install the Dataset on computers owned, leased or otherwise controlled by you and/or your organization;
+To use the Dataset for the sole purpose of performing non-commercial scientific research, non-commercial education, or non-commercial artistic projects;
+To modify, adapt, translate or create derivative works based upon the Dataset.
+Any other use, in particular any use for commercial, pornographic, military, or surveillance, purposes is prohibited. This includes, without limitation, incorporation in a commercial product, use in a commercial service, or production of other artifacts for commercial purposes. The Data & Software may not be used to create fake, libelous, misleading, or defamatory content of any kind excluding analyses in peer-reviewed scientific research. The Data & Software may not be reproduced, modified and/or made available in any form to any third party without Max-Planck’s prior written permission.
+
+The Data & Software may not be used for pornographic purposes or to generate pornographic material whether commercial or not. This license also prohibits the use of the Software to train methods/algorithms/neural networks/etc. for commercial, pornographic, military, surveillance, or defamatory use of any kind. By downloading the Data & Software, you agree not to reverse engineer it.
+
+No Distribution
+The Dataset and the license herein granted shall not be copied, shared, distributed, re-sold, offered for re-sale, transferred or sub-licensed in whole or in part except that you may make one copy for archive purposes only.
+
+Disclaimer of Representations and Warranties
+You expressly acknowledge and agree that the Dataset results from basic research, is provided “AS IS”, may contain errors, and that any use of the Dataset is at your sole risk. LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE SOFTWARE, NEITHER EXPRESS NOR IMPLIED, AND THE ABSENCE OF ANY LEGAL OR ACTUAL DEFECTS, WHETHER DISCOVERABLE OR NOT. Specifically, and not to limit the foregoing, licensor makes no representations or warranties (i) regarding the merchantability or fitness for a particular purpose of the Dataset, (ii) that the use of the Dataset will not infringe any patents, copyrights or other intellectual property rights of a third party, and (iii) that the use of the Dataset will not cause any damage of any kind to you or a third party.
+
+Limitation of Liability
+Because this Software License Agreement qualifies as a donation, according to Section 521 of the German Civil Code (Bürgerliches Gesetzbuch – BGB) Licensor as a donor is liable for intent and gross negligence only. If the Licensor fraudulently conceals a legal or material defect, they are obliged to compensate the Licensee for the resulting damage.
+Licensor shall be liable for loss of data only up to the amount of typical recovery costs which would have arisen had proper and regular data backup measures been taken. For the avoidance of doubt Licensor shall be liable in accordance with the German Product Liability Act in the event of product liability. The foregoing applies also to Licensor’s legal representatives or assistants in performance. Any further liability shall be excluded.
+Patent claims generated through the usage of the Dataset cannot be directed towards the copyright holders.
+The contractor points out that add-ons as well as minor modifications to the Dataset may lead to unforeseeable and considerable disruptions.
+
+No Maintenance Services
+You understand and agree that Licensor is under no obligation to provide either maintenance services, update services, notices of latent defects, or corrections of defects with regard to the Dataset. Licensor nevertheless reserves the right to update, modify, or discontinue the Dataset at any time.
+
+Defects of the Dataset must be notified in writing to the Licensor with a comprehensible description of the error symptoms. The notification of the defect should enable the reproduction of the error. The Licensee is encouraged to communicate any use, results, modification or publication.
+
+Publications using the Dataset
+You acknowledge that the Dataset is a valuable scientific resource and agree to appropriately reference the following paper in any publication making use of the Dataset.
+
+Citation:
+
+@inproceedings{AMASS:2019,
+  title={AMASS: Archive of Motion Capture as Surface Shapes},
+  author={Mahmood, Naureen and Ghorbani, Nima and F. Troje, Nikolaus and Pons-Moll, Gerard and Black, Michael J.},
+  booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
+  year={2019},
+  month = {Oct},
+  url = {https://amass.is.tue.mpg.de},
+  month_numeric = {10}
+}
+
+Commercial licensing opportunities
+For commercial uses of the Dataset, please send email to ps-license@tue.mpg.de
+
+This Agreement shall be governed by the laws of the Federal Republic of Germany except for the UN Sales Convention.
diff --git a/data/g1_29dof_offsets.npz b/data/g1_29dof_offsets.npz
new file mode 100644
index 0000000..cfc4a02
Binary files /dev/null and b/data/g1_29dof_offsets.npz differ
diff --git a/scripts/3-fit_smpl_motion_mp.py b/scripts/3-fit_smpl_motion_mp.py
index 9d0371e..bf70243 100644
--- a/scripts/3-fit_smpl_motion_mp.py
+++ b/scripts/3-fit_smpl_motion_mp.py
@@ -212,6 +212,8 @@ def mat_rotate(rotmat, v):
         opt.step()
         # if i % 10 == 0:
         #     print(f"iter {i}, loss {100 * loss.item():.3f}")
+        if i % 490 == 0:
+            print(f"iter {i}, loss {100 * loss.item():.3f}, keypoint_error {100 * keypoints_pos_error.item():.3f}, joint_limit_violation {L_limit.item():.6f}")
 
     with torch.no_grad():
         robot_keypoints_b = torch.stack([
diff --git a/scripts/3-fit_smpl_motion_withquat.py b/scripts/3-fit_smpl_motion_withquat.py
new file mode 100644
index 0000000..30f9920
--- /dev/null
+++ b/scripts/3-fit_smpl_motion_withquat.py
@@ -0,0 +1,365 @@
+import os
+import torch
+import torch.nn as nn
+import hydra
+import numpy as np
+import glob
+import pytorch_kinematics as pk
+import xml.etree.ElementTree as ET
+import multiprocessing as mp
+import joblib
+from isaaclab.utils.math import quat_mul, quat_error_magnitude
+
+from smplx import SMPL
+from scipy.spatial.transform import Rotation as sRot, Slerp
+
+os.environ["OMP_NUM_THREADS"] = "1"
+
+DATA_PATH = os.path.join(os.path.dirname(__file__), "../data")
+
+SMPL_BONE_ORDER_NAMES = [
+    "Pelvis",
+    "L_Hip",
+    "R_Hip",
+    "Torso",
+    "L_Knee",
+    "R_Knee",
+    "Spine",
+    "L_Ankle",
+    "R_Ankle",
+    "Chest",
+    "L_Toe",
+    "R_Toe",
+    "Neck",
+    "L_Thorax",
+    "R_Thorax",
+    "Head",
+    "L_Shoulder",
+    "R_Shoulder",
+    "L_Elbow",
+    "R_Elbow",
+    "L_Wrist",
+    "R_Wrist",
+    "L_Hand",
+    "R_Hand",
+]
+
+def parse_joint_limits_from_mjcf(root):
+    limits = {}
+    for j in root.findall(".//joint"):
+        jtype = j.get("type", "hinge")
+        if jtype not in ("hinge", "slide"):
+            continue
+        name = j.get("name")
+        rng = j.get("range")
+        if name is None or rng is None:
+            continue
+        low, high = map(float, rng.split())
+        limits[name] = (low, high)
+    return limits
+
+
+def build_chain(cfg) -> pk.Chain:
+    mjcf_path = cfg.asset.assetFileName
+
+    tree = ET.parse(mjcf_path)
+    root = tree.getroot()
+
+    # remove the free joint of the base link
+    root_name = cfg.get("root_name", "pelvis")
+    root_body = root.find(f".//body[@name='{root_name}']")
+    root_joint = root.find(".//joint[@type='free']")
+    root_body.remove(root_joint)
+    root_body.set("pos", "0 0 0")
+
+    for extend_config in cfg.extend_config:
+        parent = root.find(f".//body[@name='{extend_config.parent_name}']")
+        if parent is None:
+            raise ValueError(f"Parent body {extend_config.parent_name} not found in MJCF")
+        
+        pos = extend_config.pos
+        rot = extend_config.rot
+        # create and insert a dummy body with a fixed joint
+        body = ET.Element("body", name=extend_config.joint_name)
+        body.set("pos", f"{pos[0]} {pos[1]} {pos[2]}")
+        body.set("quat", f"{rot[0]} {rot[1]} {rot[2]} {rot[3]}")
+        inertial = ET.Element("inertial", pos="0 0 0", quat="0 0 0 1", mass="0.1", diaginertia="0.1 0.1 0.1")
+        body.append(inertial)
+        parent.append(body)
+
+    cwd = os.getcwd()
+    os.chdir(os.path.dirname(mjcf_path))
+    chain = pk.build_chain_from_mjcf(ET.tostring(root, method="xml"), body=root_name)
+    os.chdir(cwd)
+
+    joint_limits = parse_joint_limits_from_mjcf(root)
+    return chain, joint_limits
+
+
+def lerp(x, xp, fp):
+    return np.stack([np.interp(x, xp, fp[:, i]) for i in range(fp.shape[1])], axis=1)
+
+
+def slerp(x, xp, fp):
+    s = Slerp(xp, sRot.from_rotvec(fp))
+    return s(x).as_rotvec()
+
+
+def fit_motion(cfg, motion_path: str, fitted_shape: torch.Tensor):
+    
+    with open(motion_path, "rb") as f:
+        motion = dict(np.load(f, allow_pickle=True))
+    
+    if "mocap_framerate" not in motion and "mocap_frame_rate" not in motion:
+        print(f"Skipping motion file (no mocap_framerate or mocap_frame_rate): {motion_path}")
+        return None
+    
+    fps = int(motion["mocap_framerate"].item()) if "mocap_framerate" in motion else int(motion["mocap_frame_rate"].item())
+    T = motion["poses"].shape[0]
+    motion["poses"] = motion["poses"][:, :66].reshape(T, 22, 3)
+    if fps != int(cfg.target_fps):
+        end_t =  motion["poses"].shape[0] / fps
+        xp = np.arange(T) / fps
+        x = np.arange(0, end_t, 1 / int(cfg.target_fps))
+        if x[-1] > xp[-1]:
+            x = x[:-1]
+        motion["poses"] = np.stack([
+            slerp(x, xp, motion["poses"][:, i])
+            for i in range(22)
+        ], axis=1)
+        motion["trans"] = lerp(x, xp, motion["trans"])
+    
+    print(f"Retargeting motion at {motion_path} from {fps} to {cfg.target_fps}")
+
+    chain, joint_limits = build_chain(cfg)
+    chain.forward_kinematics(torch.zeros(1, chain.n_joints))
+
+    joint_names = chain.get_joint_parameter_names()
+    assert len(joint_names) == len(joint_limits), f"Number of joints in chain ({len(joint_names)}) does not match number of joint limits ({len(joint_limits)})"
+
+    low_list, high_list = [], []
+    for joint_name, (joint_limit_key, joint_limit_val) in zip(joint_names, joint_limits.items()):
+        assert joint_name == joint_limit_key, f"Joint name {joint_name} does not match joint name in joint limits ({joint_limit_key})"
+        low_list.append(joint_limit_val[0])
+        high_list.append(joint_limit_val[1])
+    low = torch.as_tensor(low_list, dtype=torch.float32).unsqueeze(0)   # [1, J]
+    high = torch.as_tensor(high_list, dtype=torch.float32).unsqueeze(0) # [1, J]
+
+    T = motion["poses"].shape[0]
+    body_pose = torch.as_tensor(motion["poses"][:, 1:], dtype=torch.float32)
+    hand_pose = torch.zeros(T, 2, 3)
+    data = {
+        "body_pose": torch.cat([body_pose, hand_pose], dim=1),
+        "global_orient": torch.as_tensor(motion["poses"][:, 0], dtype=torch.float32),
+        "trans": torch.as_tensor(motion["trans"], dtype=torch.float32),
+    }
+    body_model = SMPL(model_path=os.path.join(DATA_PATH, "smpl"), gender="neutral")
+
+    with torch.no_grad():
+        result = body_model.forward(
+            fitted_shape,
+            body_pose=data["body_pose"].reshape(T, 69),
+            global_orient=data["global_orient"],
+            transl=data["trans"],
+            return_full_pose=True
+        )
+       
+    full_pose_aa = result.full_pose  # [T, 72]
+    T = full_pose_aa.shape[0]
+    
+    full_pose_aa = full_pose_aa.reshape(T, 24, 3)
+    full_pose_aa_np = full_pose_aa.numpy()  # [T, 24, 3]
+    
+    # convert axis-angle to quaternions
+    smpl_local_quats = np.zeros((T, 24, 4), dtype=np.float32)
+    for t in range(T):
+        frame_pose = full_pose_aa_np[t]  # [24, 3]
+        rotations = sRot.from_rotvec(frame_pose)  
+        smpl_local_quats[t] = rotations.as_quat(scalar_first=True)  # [24, 4]
+    
+    smpl_local_quats = torch.as_tensor(smpl_local_quats, dtype=torch.float32)
+    
+    parents = body_model.parents
+
+    # compute global quaternions
+    smpl_global_quats = torch.zeros(T, 24, 4, dtype=torch.float32)
+    smpl_global_quats[:, 0] = smpl_local_quats[:, 0]
+    for i in range(1, 24):
+        parent_idx = parents[i]
+        smpl_global_quats[:, i] = quat_mul(
+            smpl_global_quats[:, parent_idx],
+            smpl_local_quats[:, i]
+        )
+    # apply orientation offsets
+    smpl_global_quats_modified = smpl_global_quats.clone()
+    for joint_name, quat in cfg.quat_offset.items():
+        joint_idx = SMPL_BONE_ORDER_NAMES.index(joint_name)
+        
+        # transform string to list
+        if isinstance(quat, str):
+            quat_str = quat.strip('[]')
+            quat = [float(x.strip()) for x in quat_str.split(',')]
+        
+        quat_offset = torch.as_tensor(quat, dtype=torch.float32)  # [4]
+        for t in range(T):
+            smpl_global_quats_modified[t, joint_idx] = quat_mul(
+                smpl_global_quats[t, joint_idx:joint_idx+1],
+                quat_offset.unsqueeze(0)
+            ).squeeze(0)
+
+    # which joints to match
+    robot_body_names = []
+    smpl_joint_idx = []
+    for robot_body_name, smpl_joint_name in cfg.joint_matches:
+        robot_body_names.append(robot_body_name)
+        smpl_joint_idx.append(SMPL_BONE_ORDER_NAMES.index(smpl_joint_name))
+    
+    smpl_target_mats = smpl_global_quats_modified[:, smpl_joint_idx]
+   
+    #print(f"joint_idx={smpl_joint_idx}   robot_body_names={robot_body_names}")   
+
+    # since the betas are changed and so are the SMPL body morphology,
+    # we need to make some corrections to avoid ground pentration
+    ground_offset = result.vertices[:, :, 2].min()
+    smpl_keypoints_w = result.joints[:, smpl_joint_idx] - ground_offset
+
+    # again, convert between Y-up and Z-up
+    robot_rot = sRot.from_rotvec(data["global_orient"]) * sRot.from_euler("xyz", [np.pi/2, 0., np.pi/2]).inv()
+    root_quat_init = torch.as_tensor(robot_rot.as_quat(scalar_first=True), dtype=torch.float32)
+    
+    robot_rotmat = torch.as_tensor(robot_rot.as_matrix(), dtype=torch.float32)
+
+    robot_root_quat = torch.nn.Parameter(root_quat_init.clone())
+
+    robot_th = torch.nn.Parameter(torch.zeros(T, chain.n_joints))        
+    robot_trans = torch.nn.Parameter(data["trans"].clone() - ground_offset)
+    opt = torch.optim.Adam([robot_th, robot_trans, robot_root_quat], lr=0.02)
+
+    indices = chain.get_all_frame_indices()
+    
+    def mat_rotate(rotmat, v):
+        return (rotmat @ v.unsqueeze(-1)).squeeze(-1)
+
+    for i in range(500):
+        root_quat_norm = robot_root_quat / (torch.norm(robot_root_quat, dim=-1, keepdim=True) + 1e-9)
+        robot_rotmat_opt = pk.quaternion_to_matrix(root_quat_norm)  # [T, 3, 3]
+        
+    
+        fk_output = chain.forward_kinematics(robot_th, indices) # in robot's root frame
+        robot_keypoints_b = torch.stack([
+            fk_output[name].get_matrix()[:, :3, 3]
+            for name in robot_body_names
+        ], dim=1)        
+        
+        robot_keypoints_w = robot_trans.unsqueeze(1) + mat_rotate(robot_rotmat_opt.unsqueeze(1), robot_keypoints_b)
+        
+        robot_orient_mats_b = torch.stack([
+            fk_output[name].get_matrix()[:, :3, :3]
+            for name in robot_body_names
+        ], dim=1) 
+        
+        robot_orient_mats_w = torch.matmul(robot_rotmat_opt.unsqueeze(1), robot_orient_mats_b)
+
+        robot_quats_w = pk.matrix_to_quaternion(robot_orient_mats_w)
+        robot_quats_w = robot_quats_w / (torch.norm(robot_quats_w, dim=-1, keepdim=True) + 1e-9)
+                
+        quat_error = quat_error_magnitude(robot_quats_w, smpl_global_quats_modified[:, smpl_joint_idx])  # [T, N]
+                
+        omega = torch.gradient(robot_th, spacing=1/cfg.target_fps, dim=0)[0]    # [T, J]
+        
+        violate_low  = torch.relu(low  - robot_th)
+        violate_high = torch.relu(robot_th - high)
+        L_limit = (violate_low**2 + violate_high**2).mean()
+
+        keypoints_pos_error = nn.functional.mse_loss(robot_keypoints_w, smpl_keypoints_w)
+        joint_pos_reg = 1e-2 * torch.mean(torch.square(robot_th))
+        joint_vel_reg = 1e-3 * torch.mean(torch.square(omega))
+        orientation_error = 1e-2 * torch.mean(torch.square(quat_error))  
+        joint_limit_reg = 1e2 * L_limit
+
+        if i < 20:
+            # first stage (0-19): only position loss, without this stage the optimization often Collapse
+            orientation_error = torch.tensor(0.0)
+            loss = keypoints_pos_error + joint_pos_reg + joint_vel_reg + joint_limit_reg            
+        
+        elif i < 500:
+            # second stage (200-399): add orientation loss with small weight
+            orientation_error = 1e-2* torch.mean(torch.square(quat_error))
+            loss = (keypoints_pos_error + joint_pos_reg + joint_vel_reg + 
+                    joint_limit_reg + orientation_error)
+        if i == 490:
+            print(f"iter {i}, loss {100 * loss.item():.3f}, keypoint_error {100 * keypoints_pos_error.item():.3f}, orientation_error {100 * orientation_error.item():.3f}")
+            
+        loss = (keypoints_pos_error + joint_pos_reg + joint_vel_reg + 
+                joint_limit_reg + orientation_error)
+        
+        opt.zero_grad()
+        loss.backward()        
+        opt.step()
+
+    with torch.no_grad():
+        robot_rotmat_final = pk.quaternion_to_matrix(robot_root_quat)
+
+        fk_output = chain.forward_kinematics(robot_th, indices)
+
+        robot_keypoints_b = torch.stack([
+            fk_output[name].get_matrix()[:, :3, 3]
+            for name in chain.get_link_names()
+        ], dim=1)        
+        
+        robot_keypoints_w = robot_trans.unsqueeze(1) + mat_rotate(robot_rotmat_final.unsqueeze(1), robot_keypoints_b)
+        
+        final_robot_rot = sRot.from_quat(robot_root_quat.detach().numpy(), scalar_first=True)
+
+    split_len = len(cfg.motion_path.split("/"))
+    motion_name = "0-" + "_".join(motion_path.split("/")[split_len:]).replace(".npz", "")
+    data = {
+        "fps": int(cfg.target_fps),
+        "joint_pos": robot_th.data.numpy(),
+        "root_pos_w": robot_trans.data.numpy(),
+        "root_quat_w": final_robot_rot.as_quat(),  
+        "body_pos_w": robot_keypoints_w.data.numpy(),
+        "body_pos_b": robot_keypoints_b.data.numpy(),
+    }
+    return motion_name, data
+
+
+def _worker(args):
+    cfg, path, betas = args
+    return fit_motion(cfg, path, betas)
+
+@hydra.main(version_base=None, config_path="../cfg", config_name="unitree_g1_fitting")
+def main(cfg):
+    if os.path.isdir(cfg.motion_path):
+        motion_paths = glob.glob(os.path.join(cfg.motion_path, "**/*.npz"), recursive=True)
+    else:
+        motion_paths = [cfg.motion_path]
+
+    motion_paths = [path for path in motion_paths]
+    print(f"Found {len(motion_paths)} motion files under {cfg.motion_path}")
+
+    path = os.path.join(os.path.dirname(__file__), f"{cfg.humanoid_type}_shape.npz")
+    fitted_shape = torch.from_numpy(np.load(path)["betas"])
+
+
+    from tqdm import tqdm
+    all_data = {}
+    with mp.get_context("spawn").Pool(
+        processes=6,
+        maxtasksperchild=1,
+    ) as pool, tqdm(total=len(motion_paths)) as pbar:
+        for result in pool.imap_unordered(
+            _worker,
+            [(cfg, p, fitted_shape) for p in motion_paths],
+            chunksize=1,
+        ):
+            if result is not None:
+                motion_name, data = result
+                all_data[motion_name] = data
+            pbar.update(1)
+
+    os.makedirs(f"data/{cfg.humanoid_type}", exist_ok=True)
+    joblib.dump(all_data, f"data/{cfg.humanoid_type}/{cfg.output_name}.pkl")
+
+if __name__ == "__main__":
+    main()  
\ No newline at end of file
diff --git a/scripts/5-findoffset.py b/scripts/5-findoffset.py
new file mode 100644
index 0000000..709b243
--- /dev/null
+++ b/scripts/5-findoffset.py
@@ -0,0 +1,272 @@
+import os
+import torch
+import torch.nn as nn
+import hydra
+import numpy as np
+import pytorch_kinematics as pk
+import xml.etree.ElementTree as ET
+import open3d as o3d
+
+from smplx import SMPL
+from scipy.spatial.transform import Rotation as sRot
+
+DATA_PATH = os.path.join(os.path.dirname(__file__), "../data")
+
+SMPL_BONE_ORDER_NAMES = [
+    "Pelvis", "L_Hip", "R_Hip", "Torso", "L_Knee", "R_Knee",
+    "Spine", "L_Ankle", "R_Ankle", "Chest", "L_Toe", "R_Toe",
+    "Neck", "L_Thorax", "R_Thorax", "Head", "L_Shoulder", "R_Shoulder",
+    "L_Elbow", "R_Elbow", "L_Wrist", "R_Wrist", "L_Hand", "R_Hand",
+]
+
+def build_chain(cfg):
+    mjcf_path = cfg.asset.assetFileName
+    tree = ET.parse(mjcf_path)
+    root = tree.getroot()
+
+    root_name = cfg.get("root_name", "pelvis")
+    root_body = root.find(f".//body[@name='{root_name}']")
+    root_joint = root.find(".//joint[@type='free']")
+    if root_joint is not None:
+        root_body.remove(root_joint)
+    root_body.set("pos", "0 0 0")
+
+    for extend_config in cfg.extend_config:
+        parent = root.find(f".//body[@name='{extend_config.parent_name}']")
+        if parent is None:
+            raise ValueError(f"Parent body {extend_config.parent_name} not found")
+        
+        body = ET.Element("body", name=extend_config.joint_name)
+        body.set("pos", f"{extend_config.pos[0]} {extend_config.pos[1]} {extend_config.pos[2]}")
+        body.set("quat", f"{extend_config.rot[0]} {extend_config.rot[1]} {extend_config.rot[2]} {extend_config.rot[3]}")
+        inertial = ET.Element("inertial", pos="0 0 0", quat="0 0 0 1", mass="0.1", diaginertia="0.1 0.1 0.1")
+        body.append(inertial)
+        parent.append(body)
+
+    cwd = os.getcwd()
+    os.chdir(os.path.dirname(mjcf_path))
+    chain = pk.build_chain_from_mjcf(ET.tostring(root, method="xml"), body=root_name)
+    os.chdir(cwd)
+    
+    return chain
+
+def rodrigues(axis_angle: torch.Tensor) -> torch.Tensor:
+    N = axis_angle.shape[0]
+    device = axis_angle.device
+    dtype = axis_angle.dtype
+    
+    theta = torch.norm(axis_angle, p=2, dim=1, keepdim=True)
+    I = torch.eye(3, device=device, dtype=dtype).expand(N, 3, 3)
+    mask = (theta > 1e-8).squeeze()
+    R = I.clone()
+    
+    if not mask.any():
+        return R
+        
+    aa_non_zero = axis_angle[mask]
+    theta_non_zero = theta[mask].unsqueeze(-1)
+    k = aa_non_zero / theta_non_zero.squeeze(-1)
+    
+    K = torch.zeros((k.shape[0], 3, 3), device=device, dtype=dtype)
+    K[:, 0, 1] = -k[:, 2]
+    K[:, 0, 2] = k[:, 1]
+    K[:, 1, 0] = k[:, 2]
+    K[:, 1, 2] = -k[:, 0]
+    K[:, 2, 0] = -k[:, 1]
+    K[:, 2, 1] = k[:, 0]
+    
+    sin_theta = torch.sin(theta_non_zero)
+    cos_theta = torch.cos(theta_non_zero)
+    
+    R_non_zero = I[mask] + K * sin_theta + torch.matmul(K, K) * (1.0 - cos_theta)
+    R[mask] = R_non_zero
+    
+    return R
+
+def compute_offsets(cfg):
+    """compute orientation offsets between SMPL and robot model"""
+    print("Loading robot chain...")
+    chain = build_chain(cfg)
+    
+    print("Loading SMPL model...")
+    path = os.path.join(os.path.dirname(__file__), f"{cfg.humanoid_type}_shape.npz")
+    fitted_shape = torch.from_numpy(np.load(path)["betas"]).float()
+    body_model = SMPL(model_path=os.path.join(DATA_PATH, "smpl"), gender="neutral")
+    
+    # set robot joint angles to zero
+    robot_th = torch.zeros(1, chain.n_joints, dtype=torch.float32)
+    
+    # get robot keypoints and orientations
+    robot_body_names = []
+    smpl_joint_idx = []
+    for robot_body_name, smpl_joint_name in cfg.joint_matches:
+        robot_body_names.append(robot_body_name)
+        smpl_joint_idx.append(SMPL_BONE_ORDER_NAMES.index(smpl_joint_name))
+    
+    # forward kinematics
+    indices = chain.get_all_frame_indices()
+    fk_output = chain.forward_kinematics(robot_th, indices)
+    
+    # get robot keypoints and orientations
+    root_translation = fk_output[chain.get_link_names()[0]].get_matrix()[:, :3, 3]
+    
+    robot_keypoints = torch.stack([
+        fk_output[name].get_matrix()[:, :3, 3]
+        for name in robot_body_names
+    ], dim=1)  # [1, N, 3]
+
+    robot_keypoints = robot_keypoints - root_translation.unsqueeze(1)
+    
+    robot_orient_mats = torch.stack([
+        fk_output[name].get_matrix()[:, :3, :3]
+        for name in robot_body_names
+    ], dim=1)  # [1, N, 3, 3]
+    
+    print(f"Robot keypoints shape: {robot_keypoints.shape}")
+    print(f"Matching {len(robot_body_names)} joints")
+    
+    # initialize SMPL pose parameters
+    body_pose = torch.nn.Parameter(torch.zeros(1, 23, 3, dtype=torch.float32))
+    global_orient = torch.nn.Parameter(torch.zeros(1, 3, dtype=torch.float32))
+    transl = torch.zeros(1, 3, dtype=torch.float32)  # 不优化
+    
+    opt = torch.optim.Adam([body_pose, global_orient], lr=0.01)
+    
+    # visualization storage
+    vertices_history = []
+    smpl_keypoints_history = []
+    
+    print("Optimizing SMPL pose...")
+    ITERS = 5000
+    for i in range(ITERS):
+        result = body_model.forward(
+            fitted_shape,
+            body_pose=body_pose.reshape(1, 69),
+            global_orient=global_orient,
+            transl=transl,
+            return_full_pose=True
+        )
+
+        pelvis = result.joints[:, None, 0]
+        smpl_keypoints = result.joints[:, smpl_joint_idx] - pelvis
+
+        pos_loss = nn.functional.mse_loss(smpl_keypoints, robot_keypoints)
+        
+        # regularization to keep pose reasonable
+        reg_loss = 1e-4 * (torch.sum(body_pose**2) + torch.sum(global_orient**2))
+        
+        loss = pos_loss + reg_loss
+        
+        if i % 1000 == 0 or i == ITERS - 1:
+            print(f"Iter {i}, loss: {loss.item():.6f}, pos_loss: {pos_loss.item():.6f}")
+            # store for visualization
+            with torch.no_grad():
+                v = (result.vertices[0] - result.joints[:, 0]).detach().cpu()
+                vertices_history.append(v)
+                smpl_keypoints_history.append(smpl_keypoints[0].detach().cpu())
+        
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    print("\nComputing orientation offsets...")
+    
+    # compute SMPL global rotation matrices
+    with torch.no_grad():
+        full_pose_aa = result.full_pose  # [1, 24, 3]
+        full_pose_aa_flat = full_pose_aa.view(-1, 3)
+        smpl_local_mats_flat = rodrigues(full_pose_aa_flat)
+        smpl_local_mats = smpl_local_mats_flat.view(1, 24, 3, 3)
+        
+        parents = body_model.parents
+        smpl_global_mats = torch.zeros_like(smpl_local_mats)
+        smpl_global_mats[:, 0] = smpl_local_mats[:, 0]
+        for j in range(1, 24):
+            parent_idx = parents[j]
+            smpl_global_mats[:, j] = torch.matmul(
+                smpl_global_mats[:, parent_idx],
+                smpl_local_mats[:, j]
+            )
+        
+        smpl_target_mats = smpl_global_mats[:, smpl_joint_idx]  # [1, N, 3, 3]
+    
+    # 计算offset: R_offset = R_smpl^T @ R_robot
+    offsets = {}
+    print("\nComputed offsets:")
+    for idx, (robot_name, smpl_name) in enumerate(cfg.joint_matches):
+        R_robot = robot_orient_mats[0, idx].numpy()  # [3, 3]
+        R_smpl = smpl_target_mats[0, idx].numpy()    # [3, 3]
+        
+        # R_offset = R_smpl^T @ R_robot
+        R_offset = R_smpl.T @ R_robot
+
+        rot = sRot.from_matrix(R_offset)
+        quat = rot.as_quat(scalar_first=True)
+        
+        offsets[smpl_name] = quat.tolist()
+        print(f"  {smpl_name}: [{quat[0]:.6f}, {quat[1]:.6f}, {quat[2]:.6f}, {quat[3]:.6f}]")
+
+    output_path = f"data/{cfg.humanoid_type}_offsets.npz"
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    np.savez(output_path, **offsets)
+    print(f"\nOffsets saved to {output_path}")
+
+    print("\nYAML config format:")
+    print("quat_offset:")
+    for joint_name, quat in offsets.items():
+        print(f"  {joint_name}: [{quat[0]:.6f}, {quat[1]:.6f}, {quat[2]:.6f}, {quat[3]:.6f}]")
+    
+    print("\nVisualizing results...")
+    visualize_optimization(body_model, vertices_history, smpl_keypoints_history, robot_keypoints)
+    
+    return offsets
+
+def visualize_optimization(body_model, vertices_history, smpl_keypoints_history, robot_keypoints):
+    """visualize the optimization process using Open3D"""
+    vis = o3d.visualization.Visualizer()
+    vis.create_window()
+
+    frame = o3d.geometry.TriangleMesh.create_coordinate_frame()
+    frame.compute_vertex_normals()
+    vis.add_geometry(frame)
+
+    def init_mesh(vertices, color=[0.3, 0.3, 0.3]):
+        mesh = o3d.geometry.TriangleMesh()
+        mesh.vertices = o3d.utility.Vector3dVector(vertices)
+        mesh.triangles = o3d.utility.Vector3iVector(body_model.faces)
+        mesh.compute_vertex_normals()
+        mesh.paint_uniform_color(color)
+        return mesh
+
+    robot_keypoints_cpu = robot_keypoints.detach().cpu()
+
+    for i, v in enumerate(vertices_history):
+        offset = torch.tensor([0., i * 2, 0.])  # 每个阶段间隔开
+
+        v_with_offset = v + offset + torch.tensor([2., 0., 0.])
+        mesh = init_mesh(v_with_offset.numpy())
+        vis.add_geometry(mesh)
+
+        pcd_smpl = o3d.geometry.PointCloud()
+        pcd_smpl.points = o3d.utility.Vector3dVector(
+            (smpl_keypoints_history[i] + offset + torch.tensor([1., 0., 0.])).numpy()
+        )
+        pcd_smpl.colors = o3d.utility.Vector3dVector([[0, 0, 1] for _ in range(len(smpl_keypoints_history[i]))])
+        vis.add_geometry(pcd_smpl)
+
+        pcd_robot = o3d.geometry.PointCloud()
+        pcd_robot.points = o3d.utility.Vector3dVector((robot_keypoints_cpu[0] + offset).numpy())
+        pcd_robot.colors = o3d.utility.Vector3dVector([[1, 0, 0] for _ in range(len(robot_keypoints_cpu[0]))])
+        vis.add_geometry(pcd_robot)
+
+    opt = vis.get_render_option()
+    opt.point_size = 10.0
+    
+    vis.run()
+    vis.destroy_window()
+
+@hydra.main(version_base=None, config_path="../cfg", config_name="unitree_g1_fitting")
+def main(cfg):
+    offsets = compute_offsets(cfg)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/g1_29dof_shape.npz b/scripts/g1_29dof_shape.npz
index 67f4f85..4e9d2bc 100644
Binary files a/scripts/g1_29dof_shape.npz and b/scripts/g1_29dof_shape.npz differ