Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 25 additions & 27 deletions nemo_deploy/llm/inference/inference_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import atexit
import logging
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

import megatron.core.dist_checkpointing.serialization as dist_ckpt
import torch
Expand All @@ -27,11 +27,13 @@
get_default_load_sharded_strategy,
)
from megatron.core.dist_checkpointing.validation import StrictHandling
from megatron.core.inference.contexts import StaticInferenceContext
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
Expand Down Expand Up @@ -62,29 +64,14 @@
except ImportError:
HAVE_TRITON = False

try:
if not HAVE_TRITON:
raise ImportError("Triton is not installed")
from nemo.collections.llm.gpt.model.base import GPTConfig
from nemo.collections.llm.inference.base import MCoreTokenizerWrappper
from nemo.collections.llm.modelopt import set_modelopt_spec_if_exists_in_ckpt
from nemo.collections.llm.t5.model.t5 import T5Config
from nemo.lightning import io
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
from nemo.lightning.io.pl import ckpt_to_weights_subdir

HAVE_NEMO = True
except (ImportError, ModuleNotFoundError):
HAVE_NEMO = False
from typing import Any

io = None
GPTConfig = Any
T5Config = Any
MCoreTokenizerWrappper = Any
set_modelopt_spec_if_exists_in_ckpt = None
ckpt_to_weights_subdir = None
ckpt_to_context_subdir = None
from .nemo_utils import (
HAVE_NEMO,
MCoreTokenizerWrappper,
ckpt_to_context_subdir,
ckpt_to_weights_subdir,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@oyilmaz-nvidia do we want to move NeMo 2.0 functionality here? Can't we just remove it, since the NeMo 2.0 deployment code is already removed anyway?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So that's for importing NeMo, and I'll have another PR to remove it. It's actually a lot more challenging than just adding it here.

io,
set_modelopt_spec_if_exists_in_ckpt,
)

LOGGER = logging.getLogger("NeMo")

Expand Down Expand Up @@ -465,6 +452,7 @@ def create_mcore_engine(
model_type: str = "gpt",
model_format: str = "nemo",
micro_batch_size: Optional[int] = None,
buffer_size_gb: float = 10.0,
**model_config_kwargs,
) -> Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]:
"""Set up the model, tokenizer and MCoreEngine for inference.
Expand Down Expand Up @@ -534,15 +522,25 @@ def create_mcore_engine(
else:
raise ValueError(f"Model format {model_format} not supported.")

inference_context = StaticInferenceContext(max_batch_size, inference_max_seq_length)
model_inference_wrapper = GPTInferenceWrapper(model, inference_context)
inner_model = peel(model)
model_config = inner_model.config
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=model_config.hidden_size,
params_dtype=model_config.params_dtype,
inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold,
padded_vocab_size=inner_model.vocab_size,
inference_max_requests=max_batch_size,
inference_max_seq_length=inference_max_seq_length,
)
model_inference_wrapper = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(
inference_wrapped_model=model_inference_wrapper, tokenizer=tokenizer
)
mcore_engine = MCoreEngine(
text_generation_controller=text_generation_controller,
max_batch_size=max_batch_size,
random_seed=random_seed,
buffer_size_gb=buffer_size_gb,
)

# Wrap the engine to ensure cleanup
Expand Down
188 changes: 188 additions & 0 deletions nemo_deploy/llm/inference/nemo_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NeMo utility code copied from the NeMo project.

Standalone utilities (MCoreTokenizerWrappper, checkpoint path helpers) are
copied directly and have no dependency on the nemo package.

Complex types that are tightly coupled to NeMo's class hierarchy and
serialization system (GPTConfig, T5Config, io, set_modelopt_spec_if_exists_in_ckpt)
are re-exported here from the nemo package so that inference_base.py and
tron_utils.py do not need to import from nemo directly.

Sources:
- MCoreTokenizerWrappper : nemo/collections/llm/inference/base.py
- ckpt_to_dir,
idempotent_path_append,
ckpt_to_context_subdir : nemo/lightning/ckpt_utils.py
- ckpt_to_weights_subdir : nemo/lightning/io/pl.py
- constants : nemo/lightning/ckpt_utils.py
"""

import inspect
from pathlib import Path
from typing import Any, Union

# ---------------------------------------------------------------------------
# Constants (from nemo.lightning.ckpt_utils)
# ---------------------------------------------------------------------------

# NeMo-2 checkpoint structure:
# <ckpt_dir>/weights/ – model weights
# <ckpt_dir>/context/ – hyper-parameters / IO context
WEIGHTS_PATH: str = "weights"  # sub-directory that holds the sharded model weights
CONTEXT_PATH: str = "context"  # sub-directory that holds hyper-parameters / IO context
# NOTE(review): not referenced in this module — presumably kept for parity
# with nemo.lightning.ckpt_utils; confirm before removing.
ADAPTER_META_FILENAME: str = "adapter_metadata.json"

# ---------------------------------------------------------------------------
# Checkpoint path utilities (simplified from nemo.lightning.ckpt_utils and
# nemo.lightning.io.pl – AdapterPath and MultiStorageClient branches removed
# because they are not required for basic NeMo-2 inference).
# ---------------------------------------------------------------------------


def ckpt_to_dir(filepath: Union[str, Path]) -> Path:
    """Map a PTL-style checkpoint path to its checkpoint *directory* path.

    PyTorch Lightning names checkpoints ``<name>.ckpt`` while distributed
    checkpoints live in a directory of the same name without the extension.
    The ``.ckpt`` suffix is appended first when absent so that both forms
    normalize to the same directory.

    Copied from ``nemo.lightning.ckpt_utils.ckpt_to_dir`` with the
    ``AdapterPath`` and ``MultiStorageClient`` branches removed.
    """
    ckpt_path = Path(filepath)

    # Normalize: guarantee the path ends in ".ckpt" before stripping it.
    if ckpt_path.suffix != ".ckpt":
        ckpt_path = ckpt_path.with_suffix(ckpt_path.suffix + ".ckpt")

    assert ckpt_path.suffix == ".ckpt", f"filepath: {ckpt_path} must have .ckpt extension"

    # Drop the trailing ".ckpt" to obtain the directory name.
    return ckpt_path.with_name(ckpt_path.stem)


def idempotent_path_append(base_dir: Union[str, Path], suffix: str) -> Path:
    """Join *suffix* onto *base_dir* unless it is already the final component.

    Copied from ``nemo.lightning.ckpt_utils.idempotent_path_append`` with the
    ``AdapterPath`` and ``MultiStorageClient`` branches removed.
    """
    path = Path(base_dir)
    # No-op when the directory already ends in the requested component.
    return path if path.parts[-1] == suffix else path / suffix


def ckpt_to_context_subdir(filepath: Union[str, Path]) -> Path:
    """Return the ``context`` sub-directory of a NeMo-2 checkpoint.

    Copied from ``nemo.lightning.ckpt_utils.ckpt_to_context_subdir``.
    """
    return idempotent_path_append(ckpt_to_dir(filepath=filepath), CONTEXT_PATH)


def ckpt_to_weights_subdir(filepath: Union[str, Path], is_saving: bool) -> Path:
    """Return the ``weights`` sub-directory of a NeMo-2 checkpoint.

    When saving, the ``weights`` component is always appended; when loading,
    it is appended only if that sub-directory already exists on disk.

    Copied from ``nemo.lightning.io.pl.ckpt_to_weights_subdir`` with the
    ``AdapterPath`` branch removed.
    """
    ckpt_dir = ckpt_to_dir(filepath=filepath)
    weights_dir = ckpt_dir

    if weights_dir.parts[-1] != WEIGHTS_PATH:
        candidate = weights_dir / WEIGHTS_PATH
        # Probe the filesystem first (matches the original evaluation order),
        # then unconditionally descend when saving.
        if candidate.is_dir() or is_saving:
            weights_dir = candidate

    if is_saving:
        # Sanity: a save path must be exactly <ckpt_dir>/weights.
        assert weights_dir.parts[-1] == WEIGHTS_PATH
        assert weights_dir.parent == ckpt_dir

    return weights_dir


# ---------------------------------------------------------------------------
# MCoreTokenizerWrappper (from nemo.collections.llm.inference.base)
# ---------------------------------------------------------------------------


class MCoreTokenizerWrappper:
    """Adapt a NeMo tokenizer to the interface MCore's generate pipeline expects.

    MCore calls ``tokenize``/``detokenize`` and reads ``bos``/``pad``/``eod``,
    whereas NeMo tokenizers expose ``text_to_ids``/``ids_to_text`` and
    ``bos_id``/``pad_id``. This wrapper bridges the two.

    Originally from ``nemo.collections.llm.inference.base.MCoreTokenizerWrappper``
    (the class name's spelling is preserved for compatibility).
    """

    def __init__(self, tokenizer, vocab_size=None):
        self.tokenizer = tokenizer
        self.eod = tokenizer.eod
        # Fall back to the tokenizer's own vocab size when the override is
        # absent (or falsy) — mirrors the original ``or`` semantics.
        self.vocab_size = vocab_size if vocab_size else tokenizer.vocab_size

    def detokenize(self, tokens, remove_special_tokens=False):
        """Convert a sequence of token IDs back into text."""
        ids_to_text = self.tokenizer.ids_to_text
        # Older NeMo tokenizers do not accept ``remove_special_tokens``;
        # forward it only when the underlying signature supports it.
        if "remove_special_tokens" in inspect.signature(ids_to_text).parameters:
            return ids_to_text(tokens, remove_special_tokens)
        return ids_to_text(tokens)

    def tokenize(self, prompt):
        """Convert *prompt* text into a list of token IDs."""
        return self.tokenizer.text_to_ids(prompt)

    @property
    def additional_special_tokens_ids(self):
        """IDs of any additional special tokens defined by the tokenizer."""
        return self.tokenizer.additional_special_tokens_ids

    @property
    def bos(self):
        """Beginning-of-sequence token ID."""
        return self.tokenizer.bos_id

    @property
    def pad(self):
        """Padding token ID."""
        return self.tokenizer.pad_id


# ---------------------------------------------------------------------------
# NeMo complex types
#
# GPTConfig, T5Config, io, and set_modelopt_spec_if_exists_in_ckpt are
# deeply coupled to NeMo's class hierarchy and serialization system.
# Checkpoints saved by NeMo contain instances of these exact classes, so
# they must originate from the nemo package to preserve isinstance()
# compatibility. They are re-exported here so that inference_base.py and
# tron_utils.py do not need to import from nemo directly.
# ---------------------------------------------------------------------------

# Re-export NeMo's complex types when the nemo package is available; otherwise
# bind harmless placeholders so downstream modules can import these names
# unconditionally and gate real use on HAVE_NEMO.
try:
    from nemo.collections.llm.gpt.model.base import GPTConfig
    from nemo.collections.llm.modelopt import set_modelopt_spec_if_exists_in_ckpt
    from nemo.collections.llm.t5.model.t5 import T5Config
    from nemo.lightning import io

    HAVE_NEMO = True
# ModuleNotFoundError is a subclass of ImportError, so catching ImportError
# alone covers both (the original tuple was redundant).
except ImportError:
    GPTConfig = Any  # placeholder type; isinstance checks against it are meaningless
    T5Config = Any
    io = None
    set_modelopt_spec_if_exists_in_ckpt = None
    HAVE_NEMO = False
15 changes: 1 addition & 14 deletions nemo_deploy/llm/inference/tron_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,7 @@
except ImportError:
HAVE_TRITON = False

try:
if not HAVE_TRITON:
raise ImportError("Triton is not installed")

from nemo.collections.llm.gpt.model.base import GPTConfig
from nemo.collections.llm.t5.model.t5 import T5Config

HAVE_NEMO = True
except (ImportError, ModuleNotFoundError):
from typing import Any

GPTConfig = Any
T5Config = Any
HAVE_NEMO = False
from .nemo_utils import GPTConfig, T5Config

LOGGER = logging.getLogger("NeMo")

Expand Down
3 changes: 3 additions & 0 deletions nemo_deploy/llm/megatronllm_deployable.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class MegatronLLMDeployable(ITritonDeployable):
legacy_ckpt (bool): use legacy checkpoint format. Defaults to False.
model_type (str): type of model to load. Defaults to "gpt".
micro_batch_size (Optional[int]): micro batch size for model execution. Defaults to None.
buffer_size_gb (float): KV cache buffer size in GiB for DynamicInferenceContext. Defaults to 10.0.
"""

def __init__(
Expand All @@ -102,6 +103,7 @@ def __init__(
legacy_ckpt: bool = False,
model_type: str = "gpt",
micro_batch_size: Optional[int] = None,
buffer_size_gb: float = 10.0,
**model_config_kwargs,
):
if not HAVE_TRITON:
Expand Down Expand Up @@ -131,6 +133,7 @@ def __init__(
model_type=model_type,
model_format="megatron",
micro_batch_size=micro_batch_size,
buffer_size_gb=buffer_size_gb,
**model_config_kwargs,
)
self.enable_cuda_graphs = enable_cuda_graphs
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ vllm = [
{ index = "pytorch-cu130", marker = "python_version < '3.9' and platform_machine == 'x86_64'" },
{ index = "pypi", marker = "platform_machine == 'aarch64'" },
]
megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "65a21df6bdafc198c26baa26b748fe55f3a19fd9" }
megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "15d758f1346fafa6bf0485af70deb3cc88da2909" }
# nemo-toolkit = { git = "https://github.com/NVIDIA/NeMo.git", rev = "main" }

[tool.uv]
Expand Down Expand Up @@ -156,6 +156,7 @@ override-dependencies = [
"transformers>=4.57.1",
"protobuf~=6.33.5",
"opencv-python-headless; sys_platform == 'never'",
"cryptography>=43.0.0,<47",
]
prerelease = "allow"

Expand Down
Loading
Loading