From cea6d67d5edfa822453cf87967966405bed30d90 Mon Sep 17 00:00:00 2001 From: typoverflow Date: Mon, 16 Feb 2026 13:42:37 -0500 Subject: [PATCH] feat: add fast_td3 --- scripts/reinforcement_learning/rlopt/train.py | 27 ++++--- source/isaaclab_rl/isaaclab_rl/rlopt.py | 7 +- .../locomotion/velocity/config/g1/__init__.py | 1 + .../config/g1/agents/rlopt_fasttd3_cfg.py | 70 +++++++++++++++++++ 4 files changed, 96 insertions(+), 9 deletions(-) create mode 100644 source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/agents/rlopt_fasttd3_cfg.py diff --git a/scripts/reinforcement_learning/rlopt/train.py b/scripts/reinforcement_learning/rlopt/train.py index b2a37f69fae..347d1c1300d 100644 --- a/scripts/reinforcement_learning/rlopt/train.py +++ b/scripts/reinforcement_learning/rlopt/train.py @@ -1,4 +1,9 @@ -# Feiyang Wu (feiyangwu@gatech.edu), based on sb3/trian.py +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +# Feiyang Wu (feiyangwu@gatech.edu), based on sb3/train.py """Script to train RL agent with Stable Baselines3.""" @@ -35,7 +40,7 @@ dest="algorithm", type=str.upper, default="PPO", - choices=["PPO", "SAC", "IPMD"], + choices=["PPO", "SAC", "IPMD", "FASTTD3"], help="RLOpt algorithm to train (must match the agent config).", ) @@ -85,7 +90,7 @@ def cleanup_pbar(*args): import gymnasium as gym import torch -from rlopt.agent import IPMD, PPO, SAC +from rlopt.agent import IPMD, PPO, SAC, FastTD3 from torchrl.data import TensorDictReplayBuffer from torchrl.data.replay_buffers.storages import LazyMemmapStorage from torchrl.envs import ( @@ -120,6 +125,7 @@ def cleanup_pbar(*args): "PPO": PPO, "SAC": SAC, "IPMD": IPMD, + "FASTTD3": FastTD3, } @@ -169,6 +175,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen args_cli.max_iterations * agent_cfg.collector.total_frames * env_cfg.scene.num_envs ) agent_cfg.collector.frames_per_batch *= env_cfg.scene.num_envs + agent_cfg.collector.init_random_frames *= env_cfg.scene.num_envs # set the environment seed # note: certain randomizations occur in the environment initialization so we set the seed here env_cfg.seed = agent_cfg.seed @@ -236,13 +243,17 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen env = env.set_info_dict_reader( IsaacLabTerminalObsReader(observation_spec=env.observation_spec, backend="gymnasium") # type: ignore ) + if args_cli.algorithm in ["FASTTD3", "SAC"]: + # off-policy algorithms, should not use normalization in environment wrapper + transform = Compose( + RewardSum(), + StepCounter(1000), + ) + else: + transform = Compose(RewardSum(), StepCounter(1000), VecNormV2(in_keys=agent_cfg.policy.input_keys + ["reward"])) env = TransformedEnv( env=env, - transform=Compose( - RewardSum(), # type: ignore - StepCounter(1000), # type: ignore - VecNormV2(in_keys=policy_in_keys + ["reward"]), - ), + 
transform=transform, ) agent_class = ALGORITHM_CLASS_MAP[args_cli.algorithm] diff --git a/source/isaaclab_rl/isaaclab_rl/rlopt.py b/source/isaaclab_rl/isaaclab_rl/rlopt.py index 527b27cd7e5..d66fb3e7c16 100644 --- a/source/isaaclab_rl/isaaclab_rl/rlopt.py +++ b/source/isaaclab_rl/isaaclab_rl/rlopt.py @@ -1,3 +1,8 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + from __future__ import annotations from collections import deque @@ -5,7 +10,7 @@ import gymnasium as gym import torch -from rlopt.agent import IPMDRLOptConfig, PPORLOptConfig, SACRLOptConfig # noqa: F401 +from rlopt.agent import IPMDRLOptConfig, PPORLOptConfig, SACRLOptConfig, FastTD3RLOptConfig # noqa: F401 from rlopt.config_base import RLOptConfig from torchrl.data.tensor_specs import Bounded, Composite, Unbounded from torchrl.envs.libs.gym import GymWrapper, _gym_to_torchrl_spec_transform, terminal_obs_reader diff --git a/source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/__init__.py b/source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/__init__.py index 627b309a030..6f9cc3e9fad 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/__init__.py +++ b/source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/__init__.py @@ -49,6 +49,7 @@ "rlopt_cfg_entry_point": f"{agents.__name__}.rlopt_sac_cfg:G1RLOptSACFlatConfig", "rlopt_ppo_cfg_entry_point": f"{agents.__name__}.rlopt_ppo_cfg:G1RLOptPPOFlatConfig", "rlopt_sac_cfg_entry_point": f"{agents.__name__}.rlopt_sac_cfg:G1RLOptSACFlatConfig", + "rlopt_fasttd3_cfg_entry_point": f"{agents.__name__}.rlopt_fasttd3_cfg:G1RLOptFastTD3FlatConfig", }, ) diff --git a/source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/agents/rlopt_fasttd3_cfg.py 
b/source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/agents/rlopt_fasttd3_cfg.py new file mode 100644 index 00000000000..8dd75626dcb --- /dev/null +++ b/source/isaaclab_tasks/isaaclab_tasks/manager_based/locomotion/velocity/config/g1/agents/rlopt_fasttd3_cfg.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +from isaaclab.utils import configclass + +from isaaclab_rl.rlopt import FastTD3RLOptConfig + + +# Convenience configurations for different scenarios +@configclass +class G1RLOptFastTD3Config(FastTD3RLOptConfig): + """RLOpt FastTD3 configuration for G1. + + Note: input_dim values are left as None for lazy initialization. + The networks will automatically infer dimensions from the environment specs. + """ + + def __post_init__(self): + """Post-initialization setup.""" + super().__post_init__() + + # Collector settings + self.collector.frames_per_batch = 1 # num_steps_per_env (multiplied by num_envs in train.py) + self.collector.init_random_frames = 10 + + # FastTD3 settings + self.fasttd3.gamma = 0.99 + self.fasttd3.policy_noise = 0.001 + self.fasttd3.noise_clip = 0.5 + self.fasttd3.use_cdq = True + self.fasttd3.disable_bootstrap = False + self.fasttd3.v_min = -10.0 + self.fasttd3.v_max = 10.0 + self.fasttd3.batch_size = 8 + self.fasttd3.action_bounds = 1.0 + self.fasttd3.std_max = 0.4 + self.fasttd3.num_atoms = 251 + self.fasttd3.tau = 0.1 + self.fasttd3.num_updates = 4 + self.fasttd3.num_steps = 8 + + # optimizer + self.optim.optimizer = "adamw" + self.optim.weight_decay = 0.1 + self.optim.lr = 3e-4 + self.optim.max_grad_norm = None + + # buffer + self.replay_buffer.size = 1024 * 10 + self.replay_buffer.prb = False + + +@configclass +class G1RLOptFastTD3FlatConfig(G1RLOptFastTD3Config): + """RLOpt FastTD3 configuration for G1 on flat terrain.""" + + def
__post_init__(self): + """Post-initialization setup for flat terrain.""" + super().__post_init__() + + # assert self.q_function is not None, "Q function configuration must be provided." + + # Network architecture for flat terrain + self.fasttd3.num_steps = 8 + self.fasttd3.num_updates = 4 + + # Training duration + self.collector.total_frames = 100_000_000