diff --git a/src/myvllm/engine/block_manager.py b/src/myvllm/engine/block_manager.py
index 122bed2..4a2ba9f 100644
--- a/src/myvllm/engine/block_manager.py
+++ b/src/myvllm/engine/block_manager.py
@@ -92,6 +92,7 @@ def allocate(self, seq: Sequence) -> None:
             # cache miss
             block = self._allocate_block(self.free_block_ids[0])
             block.update(h=h, token_ids=token_ids)
+            block.ref_count = 1
             if h != -1:
                 self.hash_to_block_id[h] = block.block_id
             seq.block_table.append(block.block_id)
diff --git a/src/myvllm/engine/llm_engine.py b/src/myvllm/engine/llm_engine.py
index 594e9d8..a6bc8d4 100644
--- a/src/myvllm/engine/llm_engine.py
+++ b/src/myvllm/engine/llm_engine.py
@@ -65,10 +65,11 @@ def exit(self):
-    # return scheduled sequences and whether it is for prefilling
+    # return scheduled sequences, the number of processed tokens, and whether it is for prefilling
     # call model_runner.run() to run the model
     # call postprocessor to process the outputs and update sequences and update block manager
-    def step(self) -> tuple[list[int], bool]:
+    def step(self) -> tuple[list[tuple[int, list[int]]], int, bool]:
         scheduled_sequences, is_prefill = self.scheduler.schedule()
+        num_processed_tokens = 0
         if not scheduled_sequences:
-            return [], is_prefill
+            return [], num_processed_tokens, is_prefill
         # run the model
         outputs = self.model_runner.call("run", scheduled_sequences, is_prefill)
         # Move outputs to CPU and convert them to a list