Skip to content

Commit 1c12fff

Browse files
committed
wip: raise GKD_TOPK default 20→64 (with matching teacher `max_logprobs: 64`), simplify GSM8K system prompt to \boxed{} answers, remove megatron vocab-parallel log-softmax fallback and the student loader's `device_mesh` arg
1 parent 45c09a1 commit 1c12fff

File tree

3 files changed

+3
-17
lines changed

3 files changed

+3
-17
lines changed

cookbook/rl/gkd_off_policy.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969

7070
GKD_BETA = float(os.environ.get('GKD_BETA', 0.5))
7171
GKD_TEMPERATURE = float(os.environ.get('GKD_TEMPERATURE', 1.0))
72-
GKD_TOPK = int(os.environ.get('GKD_TOPK', 20))
72+
GKD_TOPK = int(os.environ.get('GKD_TOPK', 64))
7373
MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 2048))
7474
N_SAMPLES = int(os.environ.get('N_SAMPLES', 1))
7575
ADAPTER_NAME = 'default'
@@ -188,7 +188,7 @@ def main():
188188
# ── Teacher vLLM sampler (for prompt logprobs) ─────────────────────────────
189189
teacher_sampler = vLLMSampler(
190190
model_id=TEACHER_MODEL_ID,
191-
engine_args={'gpu_memory_utilization': 0.85, 'max_model_len': 10240, 'logprobs_mode': 'raw_logprobs'},
191+
engine_args={'gpu_memory_utilization': 0.85, 'max_model_len': 10240, 'logprobs_mode': 'raw_logprobs', 'max_logprobs': 64},
192192
device_mesh=sampler_mesh,
193193
remote_group='teacher_sampler',
194194
)
@@ -199,7 +199,6 @@ def main():
199199
dataset=create_dataset,
200200
batch_size=BATCH_SIZE,
201201
min_batch_size=BATCH_SIZE,
202-
device_mesh=model_mesh,
203202
remote_group='student_model',
204203
)
205204

src/twinkle/preprocessor/llm.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,7 @@ class GSM8KProcessor(Preprocessor):
122122
Extracts the ground truth number and stores it in user_data for reward.
123123
"""
124124

125-
system_prompt = ('You are a helpful math assistant. Solve the problem step by step. '
126-
'Show your reasoning in <think> </think> tags, then give the final '
127-
'numerical answer after ####.\n'
128-
'For example:\n<think> ... reasoning ... </think>\n#### 42')
125+
system_prompt = ('You are a helpful math assistant. Solve the problem step by step and put your final answer within \\boxed{}.')
129126

130127
def extract_ground_truth(self, answer_str: str) -> str:
131128
"""Extract the number after '####' from GSM8K answer."""

src/twinkle/utils/torch_utils.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,6 @@ def selective_log_softmax(logits, index) -> 'torch.Tensor':
7272
import torch
7373
import torch.nn.functional as F
7474

75-
try:
76-
from megatron.core import parallel_state as mpu
77-
if mpu.get_tensor_model_parallel_world_size() >= 1:
78-
try:
79-
return _vocab_parallel_selective_log_softmax(logits, index)
80-
except Exception: # noqa
81-
import traceback
82-
print(traceback.format_exc())
83-
except Exception: # noqa
84-
pass
8575
if logits.dtype in [torch.float32, torch.float64]:
8676
selected_logits = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
8777
# loop to reduce peak mem consumption

0 commit comments

Comments (0)