wip: interleave DPO chosen/rejected pairs for DP-safe slicing; pass ref_outputs to the loss

tastelikefeet · tastelikefeet · commit 3a25caa7ad20 · 2026-03-27T17:01:22.000+08:00
diff --git a/cookbook/rl/dpo.py b/cookbook/rl/dpo.py
@@ -51,7 +51,6 @@
 import os
 from typing import Any, Dict, List, Optional
 
-import torch
 from peft import LoraConfig
 
 import twinkle
@@ -63,7 +62,6 @@
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import EmojiDPOProcessor
 from twinkle.processor import InputProcessor
-from twinkle.template import Template
 
 logger = get_logger()
 
@@ -75,8 +73,8 @@
 REF_MODEL_GPUS = int(os.environ.get('REF_MODEL_GPUS', 4))
 NUM_GPUS = MODEL_GPUS + REF_MODEL_GPUS
 
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))  # Number of preference pairs
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4))  # Number of preference pairs
+MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 4))
 GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 1000))
 LEARNING_RATE = float(os.environ.get('LR', 5e-6))
@@ -100,31 +98,38 @@ def create_dpo_dataset():
     )
     # DPO preprocessor returns {'positive': [...], 'negative': [...]}
     # batch_encode handles this format automatically
-    dataset.encode()
+    dataset.encode(load_from_cache_file=True)
     return dataset
 
 
-def prepare_dpo_batch(
-    batch: Dict[str, List[Any]],
-    template: Template,
-) -> List[Dict[str, Any]]:
-    """Prepare DPO batch: convert encoded batch to list format for training.
+def prepare_dpo_batch(batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Prepare DPO batch: reorganize batch for training with DP-safe interleaving.
 
     Args:
-        batch: Dict with 'positive' and 'negative' keys, each containing List[InputFeature]
+        batch: List of rows, each with 'positive' and 'negative' InputFeatures
+               and other fields (question, etc.)
 
     Returns:
-        List organized as [positive_1, ..., positive_n, negative_1, ..., negative_n]
+        List interleaved as [pos_1, neg_1, pos_2, neg_2, ...] to ensure each DP
+        worker gets complete positive/negative pairs after slicing.
+        Each item holds the row's other fields merged with one InputFeature (its keys win on clashes).
     """
-    positive_features = batch.get('positive', [])
-    negative_features = batch.get('negative', [])
+    result = []
 
-    # Convert to list of dicts
-    positive_samples = [dict(f) for f in positive_features]
-    negative_samples = [dict(f) for f in negative_features]
+    for row in batch:
+        # Get base fields (excluding positive/negative)
+        base_fields = {k: v for k, v in row.items() if k not in ('positive', 'negative')}
 
-    # Return [positive..., negative...]
-    return positive_samples + negative_samples
+        # Positive sample: merge base fields with positive InputFeature
+        pos_sample = {**base_fields, **row['positive']}
+        # Negative sample: merge base fields with negative InputFeature
+        neg_sample = {**base_fields, **row['negative']}
+
+        # Interleave: [pos, neg] per pair for DP-safe slicing
+        result.append(pos_sample)
+        result.append(neg_sample)
+
+    return result
 
 
 # ── Loss Factory ──────────────────────────────────────────────────────────────
@@ -196,9 +201,6 @@ def main():
     policy_model.set_processor(InputProcessor)
     policy_model.set_template('Template', model_id=MODEL_ID)
 
-    # Get template for encoding rejected messages
-    template = Template(model_id=MODEL_ID, max_length=MAX_LENGTH)
-
     # ── Reference Model Setup ─────────────────────────────────────────────────
     ref_model = None
     if not reference_free:
@@ -223,50 +225,19 @@ def main():
         if optim_step >= MAX_STEPS:
             break
 
-        # batch is Dict[str, List[Trajectory]] with 'positive' and 'negative' keys
-        dpo_batch = prepare_dpo_batch(batch, template)
+        # batch is List[Dict] with 'positive' and 'negative' keys
+        dpo_batch = prepare_dpo_batch(batch)
 
-        # Compute reference log probabilities if using reference model
-        # We compute sequence-level logps here to avoid alignment issues with micro-batching
-        ref_chosen_logps = None
-        ref_rejected_logps = None
+        # Get reference outputs (lazy - not collected to driver)
+        ref_outputs = None
         if ref_model is not None:
-            with torch.no_grad():
-                ref_outputs = ref_model.forward_only(inputs=dpo_batch)
-                ref_logps = ref_outputs.get('logps')  # [batch, seq_len]
-                if ref_logps is not None:
-                    # Get labels and pad to same length for stacking
-                    label_tensors = [torch.as_tensor(s['labels']) for s in dpo_batch]
-                    max_len = max(t.shape[0] for t in label_tensors)
-                    # Pad labels with -100 (ignore_index) to max length
-                    padded_labels = []
-                    for t in label_tensors:
-                        if t.shape[0] < max_len:
-                            pad_size = max_len - t.shape[0]
-                            t = torch.cat([torch.full((pad_size,), -100, dtype=t.dtype), t])
-                        padded_labels.append(t)
-                    ref_labels = torch.stack(padded_labels)
-                    if ref_labels.device != ref_logps.device:
-                        ref_labels = ref_labels.to(ref_logps.device)
-                    # Align sequence lengths if needed
-                    if ref_logps.shape[1] != ref_labels.shape[1]:
-                        min_len = min(ref_logps.shape[1], ref_labels.shape[1])
-                        ref_logps = ref_logps[:, -min_len:]
-                        ref_labels = ref_labels[:, -min_len:]
-                    # Compute sequence-level logps (sum of valid token logps)
-                    loss_mask = (ref_labels != -100).float()
-                    seq_logps = (ref_logps * loss_mask).sum(dim=-1)  # [batch]
-
-                    # Split into chosen and rejected
-                    half = seq_logps.shape[0] // 2
-                    ref_chosen_logps = seq_logps[:half]
-                    ref_rejected_logps = seq_logps[half:]
+            ref_outputs = ref_model.forward_only(inputs=dpo_batch)
 
         # Forward-backward pass with DPO loss
+        # ref_outputs is passed to loss which extracts logps internally
         policy_model.forward_backward(
             inputs=dpo_batch,
-            ref_chosen_logps=ref_chosen_logps,
-            ref_rejected_logps=ref_rejected_logps,
+            ref_outputs=ref_outputs,
         )
 
         # Gradient clipping and optimizer step
diff --git a/src/twinkle/loss/dpo.py b/src/twinkle/loss/dpo.py
@@ -88,9 +88,13 @@ def _split_chosen_rejected(
         self,
         tensor: 'torch.Tensor',
     ) -> tuple:
-        """Split tensor into chosen (first half) and rejected (second half)."""
-        half = tensor.shape[0] // 2
-        return tensor[:half], tensor[half:]
+        """Split interleaved tensor into chosen and rejected.
+
+        Input format: [pos_1, neg_1, pos_2, neg_2, ...] (interleaved for DP-safe slicing)
+        Output: (chosen [pos_1, pos_2, ...], rejected [neg_1, neg_2, ...])
+        """
+        # Even indices = chosen (positive), odd indices = rejected (negative)
+        return tensor[0::2], tensor[1::2]
 
 
 class DPOLoss(PreferenceLossBase):
@@ -131,20 +135,18 @@ def __init__(
         self.loss_type = loss_type
         self.reference_free = reference_free
 
-    def _pad_and_align_logps(
+    def _align_logps(
         self,
-        logps: Union['torch.Tensor', List[List[float]]],
+        logps: 'torch.Tensor',
         target_shape: tuple,
-        loss_mask: 'torch.Tensor',
         device: 'torch.device',
         dtype: 'torch.dtype',
     ) -> 'torch.Tensor':
-        """Pad and align log probabilities to target shape.
+        """Align log probabilities to target shape.
 
         Args:
-            logps: Input log probabilities (tensor or ragged list)
+            logps: Input log probabilities tensor
             target_shape: Target (batch, seq_len) shape
-            loss_mask: Boolean mask for valid positions
             device: Target device
             dtype: Target dtype
 
@@ -153,40 +155,32 @@ def _pad_and_align_logps(
         """
         import torch
 
-        if torch.is_tensor(logps):
-            if logps.dim() == 1:
-                logps = logps.unsqueeze(0)
-            if logps.shape == target_shape:
-                return logps.to(device=device, dtype=dtype)
-            # Handle tensor with different sequence length - align to target shape
-            if logps.dim() == 2 and logps.shape[0] == target_shape[0]:
-                batch_size, target_seq_len = target_shape
-                src_seq_len = logps.shape[1]
-                logps = logps.to(device=device, dtype=dtype)
-                if src_seq_len > target_seq_len:
-                    # Truncate: take the last target_seq_len tokens (response part)
-                    return logps[:, -target_seq_len:]
-                else:
-                    # Pad: add zeros at the beginning
-                    padded = torch.zeros(target_shape, device=device, dtype=dtype)
-                    padded[:, -src_seq_len:] = logps
-                    return padded
-
-        # Handle ragged list input
-        if isinstance(logps, (list, tuple)):
-            batch_size, seq_len = target_shape
-            padded = torch.zeros(target_shape, device=device, dtype=dtype)
-            for i, row in enumerate(logps):
-                if row is None:
-                    continue
-                row_t = torch.as_tensor(row, device=device, dtype=dtype)
-                valid_positions = loss_mask[i].nonzero(as_tuple=True)[0]
-                length = min(len(row_t), len(valid_positions))
-                if length > 0:
-                    padded[i, valid_positions[:length]] = row_t[:length]
-            return padded
-
-        return logps.to(device=device, dtype=dtype)
+        if not torch.is_tensor(logps):
+            raise TypeError(f'Expected torch.Tensor, got {type(logps)}')
+
+        if logps.dim() == 1:
+            logps = logps.unsqueeze(0)
+
+        if logps.shape == target_shape:
+            return logps.to(device=device, dtype=dtype)
+
+        # Handle tensor with different sequence length
+        if logps.dim() == 2 and logps.shape[0] == target_shape[0]:
+            batch_size, target_seq_len = target_shape
+            src_seq_len = logps.shape[1]
+            logps = logps.to(device=device, dtype=dtype)
+            if src_seq_len > target_seq_len:
+                # Truncate right (keep left part) - may happen in Ray result merging
+                return logps[:, :target_seq_len]
+            else:
+                raise ValueError(
+                    f'ref_logps seq_len ({src_seq_len}) < target seq_len ({target_seq_len}). '
+                    f'This should not happen when both models process the same batch.'
+                )
+
+        raise ValueError(
+            f'Cannot align ref_logps shape {logps.shape} to target shape {target_shape}'
+        )
 
     def _compute_dpo_loss(
         self,
@@ -254,6 +248,7 @@ def __call__(
         inputs: Dict,
         outputs: Dict,
         *,
+        ref_outputs: Optional[Dict] = None,
         ref_logps: Optional[Union['torch.Tensor', List[List[float]]]] = None,
         ref_chosen_logps: Optional['torch.Tensor'] = None,
         ref_rejected_logps: Optional['torch.Tensor'] = None,
@@ -271,6 +266,7 @@ def __call__(
             outputs: Dict containing either:
                 - 'logps': [batch, seq_len] pre-computed log probs, OR
                 - 'logits': [batch, seq_len, vocab] from which logps will be computed
+            ref_outputs: Dict from reference model forward; its 'logps' is used when ref_logps is None.
-            ref_logps: [batch, seq_len] or List[List[float]] reference model log probs.
+            ref_logps: [batch, seq_len] reference model log probs; must be a tensor (else TypeError).
                       Can also be provided as separate ref_chosen_logps and ref_rejected_logps.
             ref_chosen_logps: [batch/2] pre-computed reference log probs for chosen.
@@ -282,6 +278,10 @@ def __call__(
         """
         import torch
 
+        # Extract ref_logps from ref_outputs if provided
+        if ref_outputs is not None and ref_logps is None:
+            ref_logps = ref_outputs.get('logps')
+
         labels = inputs.get('labels')
         assert labels is not None, "inputs must contain 'labels'"
         if not torch.is_tensor(labels):
@@ -312,9 +312,8 @@ def __call__(
             reference_rejected_logps = ref_rejected_logps.to(device=device, dtype=dtype)
         elif ref_logps is not None:
             # Per-token reference log probs provided, need to align and sum
-            loss_mask = (labels != self.ignore_index).bool()
-            ref_logps_aligned = self._pad_and_align_logps(
-                ref_logps, labels.shape, loss_mask, device, dtype
+            ref_logps_aligned = self._align_logps(
+                ref_logps, labels.shape, device, dtype
             )
             ref_chosen, ref_rejected = self._split_chosen_rejected(ref_logps_aligned)
             reference_chosen_logps = self._compute_sequence_logps(ref_chosen, chosen_labels)
diff --git a/src/twinkle/metric/loss.py b/src/twinkle/metric/loss.py
@@ -52,7 +52,6 @@ def calculate(self):
             'grad_norm': self.grad_norm,
             'num_tokens': self.num_tokens
         }]
-
         all_results = self.gather_results(local_results)
 
         total_loss = sum(r['loss'] for r in all_results)
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py