From ba3cee8e24ca5f9f6a23ae6448d4aaa8ebdd5a65 Mon Sep 17 00:00:00 2001
From: Brian Lee
Date: Mon, 10 Nov 2025 20:35:26 -0500
Subject: [PATCH 1/2] Add RL Box Flipping Problem

---
 book/rl.html               |  27 +++++++++
 book/rl/train_boxflipup.py | 120 ++++++++++++++++++++++++++++++++-----
 2 files changed, 132 insertions(+), 15 deletions(-)

diff --git a/book/rl.html b/book/rl.html
index 941e334c..d6d14361 100644
--- a/book/rl.html
+++ b/book/rl.html
@@ -299,6 +299,33 @@

Using a Drake simulation as a Gym environment

  • Implement the advantage function.
  •

    Analyzing Box Flipping with RL

    In this exercise, you will analyze the behavior of a PPO policy trained to flip over a box. Like REINFORCE, PPO is a policy-gradient method that directly optimizes the policy parameters to maximize the expected return. To keep the problem easy to analyze, we'll use the box flip-up example from Chapter 8: our robot will be a simple point finger, and the goal will be to flip over the box. You can find the code used to train the policy here.
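    If you want to poke at the environment interactively before answering the questions below, a minimal sketch follows. It assumes that importing manipulation.envs.box_flipup registers the environment with Gymnasium under the id "BoxFlipUp-v0"; check box_flipup.py for the exact id and any required keyword arguments (e.g., the observation mode).

import gymnasium as gym

import manipulation.envs.box_flipup  # importing registers the Drake-based env

env = gym.make("BoxFlipUp-v0")  # id assumed; see the registration in box_flipup.py
print(env.observation_space)  # should contain the box and finger states used below
print(env.action_space)  # the commanded finger position u_f

obs, _ = env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
print(reward)  # one sample of the reward you are asked to write down in part 1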
    1. Take a look at the code used to generate the environment. Let $\theta$ denote the angle of the box from the vertical, $\omega$ denote the angular velocity of the box, $q_f$ denote the observed position of the finger, $v_f$ denote the velocity of the finger, and $u_f$ denote the commanded position of the finger. What is the reward function used here to train the policy? Write it down mathematically (use the modulo operator to handle the "wrapping around" of the angle). What do the individual terms in the reward function represent? Why do they make sense?
    2. Although we will not go into the exact details of how PPO works here, it operates much like REINFORCE, but it uses (i) a learned value function to reduce variance and (ii) an approximate objective with a "trust region" constraint, enforced by clipping the per-sample loss so that the policy is not updated too much at each step (the standard clipped objective is written out after this list). Briefly explain (a) why PPO might be more stable and sample-efficient than REINFORCE, and (b) how you would expect PPO to perform on the box-flipping task if the clipping limits are set too small or too large.
    3. We've trained a PPO-based policy to flip the box for 3,000,000 steps (see here for videos of the policy in action at each of the saved checkpoints). How does the policy perform as the number of steps increases? Describe qualitatively how the policy changes over time and which parts of the reward function have the greatest effect at each stage.
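    For reference when answering part (b) of item 2, the standard clipped surrogate objective from the PPO paper (Schulman et al., 2017) is
    $$\max_\theta \; \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon)\,\hat{A}_t\big)\Big], \qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)},$$
    where $\hat{A}_t$ is the estimated advantage and the clip range $\epsilon$ controls how far the new policy may move from the old one in a single update.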
    Notice how much time it takes to train a working policy, even for a simple manipulation problem like the 2D box-flipping example with a point finger and a dense reward. Harder manipulation problems (such as pick and place) can become extremely challenging to train naïvely with reinforcement learning, especially with sparse rewards, e.g., when you only receive a reward once the object has been picked or placed in the right location. On the other hand, reinforcement learning can work well in contact-rich settings (as in the box-flipping example); see RL solving a Rubik's Cube with one hand for an example of RL being used to solve a contact-rich manipulation task (note that this also depended heavily on domain randomization, curriculum learning, large-scale compute, etc.). The story in locomotion seems to be quite different, perhaps because it is easier to design dense rewards and to automate resets in simulation.
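    To make the dense-versus-sparse distinction concrete, here is an illustrative sketch for a pick-and-place-style task; the function and variable names are hypothetical, and this is not the reward used in any of the environments above.

import numpy as np

def sparse_reward(object_pos, goal_pos, tol=0.01):
    # Reward only once the object is already at the goal: a randomly exploring
    # policy almost never sees a nonzero reward, so learning is very slow.
    return 1.0 if np.linalg.norm(object_pos - goal_pos) < tol else 0.0

def dense_reward(object_pos, goal_pos, gripper_pos):
    # Shaped reward: progress toward reaching the object and toward the goal is
    # rewarded at every step, giving a learning signal long before any success.
    reach = -np.linalg.norm(gripper_pos - object_pos)
    place = -np.linalg.norm(object_pos - goal_pos)
    return reach + place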
diff --git a/book/rl/train_boxflipup.py b/book/rl/train_boxflipup.py
index f13e1bb3..7af25433 100644
--- a/book/rl/train_boxflipup.py
+++ b/book/rl/train_boxflipup.py
@@ -1,25 +1,75 @@
 """
 Train a policy for manipulation.envs.box_flipup
+
+Example usage:
+
+python book/rl/train_boxflipup.py --checkpoint_freq 100000 --wandb
 """
 import argparse
 import os
 import sys
+from pathlib import Path
 
 import gymnasium as gym
-import wandb
 
 # `multiprocessing` also provides this method, but empirically `psutil`'s
 # version seems more reliable.
 from psutil import cpu_count
 from pydrake.all import StartMeshcat
 from stable_baselines3 import PPO
+from stable_baselines3.common.callbacks import (
+    BaseCallback,
+    CallbackList,
+    EveryNTimesteps,
+    ProgressBarCallback,
+)
 from stable_baselines3.common.env_checker import check_env
 from stable_baselines3.common.env_util import make_vec_env
 from stable_baselines3.common.vec_env import SubprocVecEnv
 from wandb.integration.sb3 import WandbCallback
 
 import manipulation.envs.box_flipup  # no-member
+import wandb
+
+
+class OffsetCheckpointCallback(BaseCallback):
+    """
+    Saves checkpoints with a global step count that includes an offset, so that
+    training resumed from, e.g., 3,000,000 steps saves checkpoints named with
+    accumulated steps (e.g., 4,000,000 after 1,000,000 more steps).
+
+    This callback is intended to be wrapped by EveryNTimesteps for frequency control.
+    """
+
+    def __init__(
+        self,
+        save_path: Path,
+        name_prefix: str,
+        expected_resume_steps: int | None = None,
+        verbose: int = 0,
+    ):
+        super().__init__(verbose)
+        self.save_path = Path(save_path)
+        self.save_path.mkdir(parents=True, exist_ok=True)
+        self.name_prefix = name_prefix
+        self.expected_resume_steps = expected_resume_steps
+
+    def _on_step(self) -> bool:
+        # Determine the effective offset only at save time to avoid relying on
+        # construction order.
+        loaded_steps = int(getattr(self.model, "num_timesteps", 0))
+        offset = 0
+        if (
+            self.expected_resume_steps is not None
+            and loaded_steps < self.expected_resume_steps
+        ):
+            offset = int(self.expected_resume_steps)
+        total_steps = offset + loaded_steps
+        ckpt_path = self.save_path / f"{self.name_prefix}_{total_steps}_steps.zip"
+        if self.verbose > 0:
+            print(f"Saving checkpoint to {ckpt_path}")
+        self.model.save(str(ckpt_path))
+        return True
 
 
 def main():
@@ -27,11 +77,18 @@ def main():
     parser.add_argument("--test", action="store_true")
     parser.add_argument("--train_single_env", action="store_true")
     parser.add_argument("--wandb", action="store_true", default=False)
+    parser.add_argument("--checkpoint_freq", type=int, default=100_000)
     parser.add_argument("--debug", action="store_true")
+    parser.add_argument(
+        "--resume_steps",
+        type=int,
+        default=None,
+        help="If set, resume from the checkpoint at this timestep (e.g., 3000000).",
+    )
     parser.add_argument(
         "--log_path",
         help="path to the logs directory.",
-        default="/tmp/BoxFlipUp/",
+        default="book/rl/BoxFlipUp/logs",
     )
     args = parser.parse_args()
@@ -55,6 +112,29 @@ def main():
     else:
         run = wandb.init(mode="disabled")
 
+    # Where to put checkpoints.
+    ckpt_dir = Path(args.log_path).parent / "checkpoints"
+    ckpt_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save a checkpoint each time this callback fires; EveryNTimesteps (below)
+    # controls how often that happens.
+    checkpoint_cb = OffsetCheckpointCallback(
+        save_path=ckpt_dir,
+        name_prefix="ppo_boxflipup",
+        expected_resume_steps=args.resume_steps,
+    )
+
+    # Trigger the checkpoint every `checkpoint_freq` timesteps (robust to n_envs).
+    every_n_timesteps = EveryNTimesteps(
+        n_steps=args.checkpoint_freq, callback=checkpoint_cb
+    )
+
+    # Combine the checkpoint trigger with the Wandb and progress-bar callbacks.
+    callbacks = CallbackList(
+        [WandbCallback(), every_n_timesteps, ProgressBarCallback()]
+    )
+
     zip = f"data/box_flipup_ppo_{config['observations']}.zip"
 
     num_cpu = int(cpu_count() / 2) if not args.test else 2
@@ -88,22 +168,32 @@ def make_boxflipup():
 
     if args.test:
         model = PPO("MlpPolicy", env, n_steps=4, n_epochs=2, batch_size=8)
+    elif (
+        args.resume_steps is not None
+        and (ckpt_dir / f"ppo_boxflipup_{args.resume_steps}_steps.zip").exists()
+    ):
+        print(f"Loading checkpoint at {args.resume_steps} steps")
+        model = PPO.load(
+            str(ckpt_dir / f"ppo_boxflipup_{args.resume_steps}_steps.zip"),
+            env,
+            verbose=1,
+            tensorboard_log=f"runs/{run.id}",
+            device="cuda",
+        )
     elif os.path.exists(zip):
-        model = PPO.load(zip, env, verbose=1, tensorboard_log=f"runs/{run.id}")
+        model = PPO.load(
+            zip, env, verbose=1, tensorboard_log=f"runs/{run.id}", device="cuda"
+        )
     else:
-        model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
-
-    new_log = True
-    while True:
-        model.learn(
-            total_timesteps=100000 if not args.test else 4,
-            reset_num_timesteps=new_log,
-            callback=WandbCallback(),
+        model = PPO(
+            "MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}", device="cuda"
         )
-        if args.test:
-            break
-        model.save(zip)
-        new_log = False
+
+    model.learn(
+        total_timesteps=3_000_000 if not args.test else 4,
+        callback=callbacks,
+    )
+    model.save(zip)
 
 
 if __name__ == "__main__":


From cce9a72e21c8a08e1e65800a093ade2bf5024d22 Mon Sep 17 00:00:00 2001
From: bernhardpg
Date: Tue, 11 Nov 2025 16:30:03 -0500
Subject: [PATCH 2/2] Run pre-commits

---
 book/rl/train_boxflipup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/book/rl/train_boxflipup.py b/book/rl/train_boxflipup.py
index 7af25433..88d0f5c8 100644
--- a/book/rl/train_boxflipup.py
+++ b/book/rl/train_boxflipup.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 
 import gymnasium as gym
+import wandb
 
 # `multiprocessing` also provides this method, but empirically `psutil`'s
 # version seems more reliable.
@@ -30,7 +31,6 @@
 from wandb.integration.sb3 import WandbCallback
 
 import manipulation.envs.box_flipup  # no-member
-import wandb
 
 
 class OffsetCheckpointCallback(BaseCallback):
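With the flags added in this patch, resuming training from an earlier run might look like the following (a usage sketch; it assumes the 3,000,000-step checkpoint was previously written as ppo_boxflipup_3000000_steps.zip into the checkpoints directory created next to --log_path, as OffsetCheckpointCallback does above):

python book/rl/train_boxflipup.py --resume_steps 3000000 --checkpoint_freq 100000 --wandb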