fix

Yunnglin · Yunnglin · commit 1315d44e14e5 · 2026-04-09T20:29:09.000+08:00
diff --git a/cookbook/client/tinker/self_host/sample.py b/cookbook/client/tinker/self_host/sample.py
@@ -8,7 +8,7 @@
 from tinker import types
 
 from twinkle.data_format import Message, Trajectory
-from twinkle.template import Template
+from twinkle.template import Template, Qwen3_5Template
 from twinkle import init_tinker_client
 
 # Step 1: Initialize Tinker client
@@ -27,14 +27,14 @@
 # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint.
 # The server will load the base model and apply the LoRA adapter weights.
 sampling_client = service_client.create_sampling_client(
-    model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1',
+    # model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1',
     base_model=base_model
 )
 
 # Step 4: Load the tokenizer locally to encode the prompt and decode the results
 print(f'Using model {base_model}')
 
-template = Template(model_id=f'ms://{base_model}')
+template = Qwen3_5Template(model_id=f'ms://{base_model}')
 
 trajectory = Trajectory(
     messages=[
@@ -43,7 +43,7 @@
     ]
 )
 
-input_feature = template.encode(trajectory, add_generation_prompt=True)
+input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0]
 
 input_ids = input_feature['input_ids'].tolist()
 
diff --git a/cookbook/client/tinker/self_host/short_math_grpo.py b/cookbook/client/tinker/self_host/short_math_grpo.py
@@ -39,7 +39,7 @@
 
 # ========== Configuration ==========
 BASE_MODEL = 'Qwen/Qwen3.5-4B'
-NUM_GENERATIONS = 8
+NUM_GENERATIONS = 4
 MAX_NEW_TOKENS = 4096
 LEARNING_RATE = 1e-5
 MAX_STEPS = 1000
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
@@ -1169,11 +1169,14 @@ def _save_hf_format(self, output_dir: str, adapter_name: str, lora_converter=Non
         # Save config on rank 0 only
         if dp_rank == 0:
             self.hf_config.save_pretrained(output_dir)
+            if isinstance(model[0], PeftModel):
+                model[0].peft_config[adapter_name].save_pretrained(output_dir)
 
     def _save_megatron_format(self, output_dir: str, adapter_name: str, lora_converter=None):
         """Save in Megatron checkpoint format."""
         os.makedirs(output_dir, exist_ok=True)
-
+        from megatron.core import parallel_state as mpu
+        dp_rank = mpu.get_data_parallel_rank() if mpu.is_initialized() else 0
         state_dict = self._get_trainable_parameters(adapter_name)
         cpu_state_dict = {}
         for k, v in state_dict.items():
@@ -1189,6 +1192,12 @@ def _save_megatron_format(self, output_dir: str, adapter_name: str, lora_convert
         rank = dist.get_rank() if dist.is_initialized() else 0
         checkpoint_path = os.path.join(output_dir, f'model_rank{rank}.pt')
         torch.save(cpu_state_dict, checkpoint_path)
+        # Save the HF config (and the PEFT adapter config, if any) only on ranks whose data-parallel rank is 0
+        model = self.strategy.unwrap_model(self.model)
+        if dp_rank == 0:
+            self.hf_config.save_pretrained(output_dir)
+            if isinstance(model[0], PeftModel):
+                model[0].peft_config[adapter_name].save_pretrained(output_dir)
 
     def _save_tokenizer(self, output_dir: str, **kwargs):
         from twinkle.utils import is_last_rank
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -235,14 +235,15 @@ async def _sample_single(
         """
         multi_modal_data = self._extract_multi_modal_data(feat)
         response = await self.engine.sample(
-            prompt=feat['prompt'],
+            prompt=feat['prompt'] if 'prompt' in feat else feat['input_ids'],
             sampling_params=sampling_params,
             lora_request=lora_request,
             multi_modal_data=multi_modal_data,
             mm_processor_kwargs=feat.get('mm_processor_kwargs'),
         )
-        feat['input_ids'] = response.prompt_token_ids
-        feat['labels'] = [-100] * len(response.prompt_token_ids)
+        if 'input_ids' not in feat:
+            feat['input_ids'] = response.prompt_token_ids
+            feat['labels'] = [-100] * len(response.prompt_token_ids)
         if not logprobs_only:
             # response.sequences contains num_samples sequences for this prompt
             sequences = []
@@ -318,7 +319,7 @@ def sample(
         inputs_list = self._normalize_inputs(inputs)
 
         # Check if inputs are Trajectory (not encoded) - aligned with Model.forward logic
-        is_trajectory = 'prompt' not in inputs_list[0] or 'input_ids' not in inputs_list[0]
+        is_trajectory = 'prompt' not in inputs_list[0] and 'input_ids' not in inputs_list[0]
         logprobs_only = False
         if sampling_params.max_tokens == 0:
             sampling_params.max_tokens = 1