Wenyueh · LeeWant · Mar 30, 2026
diff --git a/src/myvllm/engine/block_manager.py b/src/myvllm/engine/block_manager.py
@@ -92,6 +92,7 @@ def allocate(self, seq: Sequence) -> None:
                 # cache miss
                 block = self._allocate_block(self.free_block_ids[0])
                 block.update(h=h, token_ids=token_ids)
+                block.ref_count = 1
                 if h != -1:
                     self.hash_to_block_id[h] = block.block_id
             seq.block_table.append(block.block_id)

diff --git a/src/myvllm/engine/llm_engine.py b/src/myvllm/engine/llm_engine.py
@@ -65,10 +65,11 @@ def exit(self):
     # return scheduled sequences and whether it is for prefilling
     # call model_runner.run() to run the model
     # call postprocessor to process the outputs and update sequences and update block manager
-    def step(self) -> tuple[list[int], bool]:
+    def step(self) -> tuple[list[tuple[int, list[int]]], int, bool]:
         scheduled_sequences, is_prefill = self.scheduler.schedule()
+        num_processed_tokens = 0
         if not scheduled_sequences:
-            return [], is_prefill
+            return [], num_processed_tokens, is_prefill
         # run the model
         outputs = self.model_runner.call("run", scheduled_sequences, is_prefill)
         # Move outputs to CPU and convert them to a list