Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 25 additions & 27 deletions nemo_deploy/llm/inference/inference_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import atexit
import logging
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

import megatron.core.dist_checkpointing.serialization as dist_ckpt
import torch
Expand All @@ -27,11 +27,13 @@
get_default_load_sharded_strategy,
)
from megatron.core.dist_checkpointing.validation import StrictHandling
from megatron.core.inference.contexts import StaticInferenceContext
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
Expand Down Expand Up @@ -62,29 +64,14 @@
except ImportError:
HAVE_TRITON = False

try:
if not HAVE_TRITON:
raise ImportError("Triton is not installed")
from nemo.collections.llm.gpt.model.base import GPTConfig
from nemo.collections.llm.inference.base import MCoreTokenizerWrappper
from nemo.collections.llm.modelopt import set_modelopt_spec_if_exists_in_ckpt
from nemo.collections.llm.t5.model.t5 import T5Config
from nemo.lightning import io
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
from nemo.lightning.io.pl import ckpt_to_weights_subdir

HAVE_NEMO = True
except (ImportError, ModuleNotFoundError):
HAVE_NEMO = False
from typing import Any

io = None
GPTConfig = Any
T5Config = Any
MCoreTokenizerWrappper = Any
set_modelopt_spec_if_exists_in_ckpt = None
ckpt_to_weights_subdir = None
ckpt_to_context_subdir = None
from .nemo_utils import (
HAVE_NEMO,
MCoreTokenizerWrappper,
ckpt_to_context_subdir,
ckpt_to_weights_subdir,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@oyilmaz-nvidia do we want to move NeMo 2.0 functionality here? Can't we just remove it, since the NeMo 2.0 deployment code is already removed anyway?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So that's for importing NeMo, and I'll have another PR to remove it. It's actually a lot more challenging than just adding it here.

io,
set_modelopt_spec_if_exists_in_ckpt,
)

LOGGER = logging.getLogger("NeMo")

Expand Down Expand Up @@ -465,6 +452,7 @@ def create_mcore_engine(
model_type: str = "gpt",
model_format: str = "nemo",
micro_batch_size: Optional[int] = None,
buffer_size_gb: float = 10.0,
**model_config_kwargs,
) -> Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]:
"""Set up the model, tokenizer and MCoreEngine for inference.
Expand Down Expand Up @@ -534,15 +522,25 @@ def create_mcore_engine(
else:
raise ValueError(f"Model format {model_format} not supported.")

inference_context = StaticInferenceContext(max_batch_size, inference_max_seq_length)
model_inference_wrapper = GPTInferenceWrapper(model, inference_context)
inner_model = peel(model)
model_config = inner_model.config
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=model_config.hidden_size,
params_dtype=model_config.params_dtype,
inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold,
padded_vocab_size=inner_model.vocab_size,
inference_max_requests=max_batch_size,
inference_max_seq_length=inference_max_seq_length,
)
model_inference_wrapper = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(
inference_wrapped_model=model_inference_wrapper, tokenizer=tokenizer
)
mcore_engine = MCoreEngine(
text_generation_controller=text_generation_controller,
max_batch_size=max_batch_size,
random_seed=random_seed,
buffer_size_gb=buffer_size_gb,
)

# Wrap the engine to ensure cleanup
Expand Down
188 changes: 188 additions & 0 deletions nemo_deploy/llm/inference/nemo_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NeMo utility code copied from the NeMo project.

Standalone utilities (MCoreTokenizerWrappper, checkpoint path helpers) are
copied directly and have no dependency on the nemo package.

Complex types that are tightly coupled to NeMo's class hierarchy and
serialization system (GPTConfig, T5Config, io, set_modelopt_spec_if_exists_in_ckpt)
are re-exported here from the nemo package so that inference_base.py and
tron_utils.py do not need to import from nemo directly.

Sources:
- MCoreTokenizerWrappper : nemo/collections/llm/inference/base.py
- ckpt_to_dir,
idempotent_path_append,
ckpt_to_context_subdir : nemo/lightning/ckpt_utils.py
- ckpt_to_weights_subdir : nemo/lightning/io/pl.py
- constants : nemo/lightning/ckpt_utils.py
"""

import inspect
from pathlib import Path
from typing import Any, Union

# ---------------------------------------------------------------------------
# Constants (from nemo.lightning.ckpt_utils)
# ---------------------------------------------------------------------------

# NeMo-2 checkpoint structure:
# <ckpt_dir>/weights/ – model weights
# <ckpt_dir>/context/ – hyper-parameters / IO context
WEIGHTS_PATH: str = "weights"  # sub-directory that holds the sharded model weights
CONTEXT_PATH: str = "context"  # sub-directory that holds hyper-parameters / IO context
# NOTE(review): not referenced in this module — presumably kept for parity
# with nemo.lightning.ckpt_utils; confirm before removing.
ADAPTER_META_FILENAME: str = "adapter_metadata.json"

# ---------------------------------------------------------------------------
# Checkpoint path utilities (simplified from nemo.lightning.ckpt_utils and
# nemo.lightning.io.pl – AdapterPath and MultiStorageClient branches removed
# because they are not required for basic NeMo-2 inference).
# ---------------------------------------------------------------------------


def ckpt_to_dir(filepath: Union[str, Path]) -> Path:
    """Map a PTL-style checkpoint path to its checkpoint *directory* path.

    PyTorch Lightning names checkpoints ``<name>.ckpt`` while distributed
    checkpoints live in a directory of the same name without the extension.
    The ``.ckpt`` suffix is appended first when absent so that both forms
    normalize to the same directory.

    Copied from ``nemo.lightning.ckpt_utils.ckpt_to_dir`` with the
    ``AdapterPath`` and ``MultiStorageClient`` branches removed.
    """
    ckpt_path = Path(filepath)

    # Normalize: guarantee the path ends in ".ckpt" before stripping it.
    if ckpt_path.suffix != ".ckpt":
        ckpt_path = ckpt_path.with_suffix(ckpt_path.suffix + ".ckpt")

    assert ckpt_path.suffix == ".ckpt", f"filepath: {ckpt_path} must have .ckpt extension"

    # Drop the trailing ".ckpt" to obtain the directory name.
    return ckpt_path.with_name(ckpt_path.stem)


def idempotent_path_append(base_dir: Union[str, Path], suffix: str) -> Path:
    """Join *suffix* onto *base_dir* unless it is already the final component.

    Copied from ``nemo.lightning.ckpt_utils.idempotent_path_append`` with the
    ``AdapterPath`` and ``MultiStorageClient`` branches removed.
    """
    path = Path(base_dir)
    # No-op when the directory already ends in the requested component.
    return path if path.parts[-1] == suffix else path / suffix


def ckpt_to_context_subdir(filepath: Union[str, Path]) -> Path:
    """Return the ``context`` sub-directory of a NeMo-2 checkpoint.

    Copied from ``nemo.lightning.ckpt_utils.ckpt_to_context_subdir``.
    """
    return idempotent_path_append(ckpt_to_dir(filepath=filepath), CONTEXT_PATH)


def ckpt_to_weights_subdir(filepath: Union[str, Path], is_saving: bool) -> Path:
    """Return the ``weights`` sub-directory of a NeMo-2 checkpoint.

    When saving, the ``weights`` component is always appended; when loading,
    it is appended only if that sub-directory already exists on disk.

    Copied from ``nemo.lightning.io.pl.ckpt_to_weights_subdir`` with the
    ``AdapterPath`` branch removed.
    """
    ckpt_dir = ckpt_to_dir(filepath=filepath)
    weights_dir = ckpt_dir

    if weights_dir.parts[-1] != WEIGHTS_PATH:
        candidate = weights_dir / WEIGHTS_PATH
        # Probe the filesystem first (matches the original evaluation order),
        # then unconditionally descend when saving.
        if candidate.is_dir() or is_saving:
            weights_dir = candidate

    if is_saving:
        # Sanity: a save path must be exactly <ckpt_dir>/weights.
        assert weights_dir.parts[-1] == WEIGHTS_PATH
        assert weights_dir.parent == ckpt_dir

    return weights_dir


# ---------------------------------------------------------------------------
# MCoreTokenizerWrappper (from nemo.collections.llm.inference.base)
# ---------------------------------------------------------------------------


class MCoreTokenizerWrappper:
    """Adapt a NeMo tokenizer to the interface MCore's generate pipeline expects.

    MCore calls ``tokenize``/``detokenize`` and reads ``bos``/``pad``/``eod``,
    whereas NeMo tokenizers expose ``text_to_ids``/``ids_to_text`` and
    ``bos_id``/``pad_id``. This wrapper bridges the two.

    Originally from ``nemo.collections.llm.inference.base.MCoreTokenizerWrappper``
    (the class name's spelling is preserved for compatibility).
    """

    def __init__(self, tokenizer, vocab_size=None):
        self.tokenizer = tokenizer
        self.eod = tokenizer.eod
        # Fall back to the tokenizer's own vocab size when the override is
        # absent (or falsy) — mirrors the original ``or`` semantics.
        self.vocab_size = vocab_size if vocab_size else tokenizer.vocab_size

    def detokenize(self, tokens, remove_special_tokens=False):
        """Convert a sequence of token IDs back into text."""
        ids_to_text = self.tokenizer.ids_to_text
        # Older NeMo tokenizers do not accept ``remove_special_tokens``;
        # forward it only when the underlying signature supports it.
        if "remove_special_tokens" in inspect.signature(ids_to_text).parameters:
            return ids_to_text(tokens, remove_special_tokens)
        return ids_to_text(tokens)

    def tokenize(self, prompt):
        """Convert *prompt* text into a list of token IDs."""
        return self.tokenizer.text_to_ids(prompt)

    @property
    def additional_special_tokens_ids(self):
        """IDs of any additional special tokens defined by the tokenizer."""
        return self.tokenizer.additional_special_tokens_ids

    @property
    def bos(self):
        """Beginning-of-sequence token ID."""
        return self.tokenizer.bos_id

    @property
    def pad(self):
        """Padding token ID."""
        return self.tokenizer.pad_id


# ---------------------------------------------------------------------------
# NeMo complex types
#
# GPTConfig, T5Config, io, and set_modelopt_spec_if_exists_in_ckpt are
# deeply coupled to NeMo's class hierarchy and serialization system.
# Checkpoints saved by NeMo contain instances of these exact classes, so
# they must originate from the nemo package to preserve isinstance()
# compatibility. They are re-exported here so that inference_base.py and
# tron_utils.py do not need to import from nemo directly.
# ---------------------------------------------------------------------------

# Re-export NeMo's complex types when the nemo package is available; otherwise
# bind harmless placeholders so downstream modules can import these names
# unconditionally and gate real use on HAVE_NEMO.
try:
    from nemo.collections.llm.gpt.model.base import GPTConfig
    from nemo.collections.llm.modelopt import set_modelopt_spec_if_exists_in_ckpt
    from nemo.collections.llm.t5.model.t5 import T5Config
    from nemo.lightning import io

    HAVE_NEMO = True
# ModuleNotFoundError is a subclass of ImportError, so catching ImportError
# alone covers both (the original tuple was redundant).
except ImportError:
    GPTConfig = Any  # placeholder type; isinstance checks against it are meaningless
    T5Config = Any
    io = None
    set_modelopt_spec_if_exists_in_ckpt = None
    HAVE_NEMO = False
15 changes: 1 addition & 14 deletions nemo_deploy/llm/inference/tron_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,7 @@
except ImportError:
HAVE_TRITON = False

try:
if not HAVE_TRITON:
raise ImportError("Triton is not installed")

from nemo.collections.llm.gpt.model.base import GPTConfig
from nemo.collections.llm.t5.model.t5 import T5Config

HAVE_NEMO = True
except (ImportError, ModuleNotFoundError):
from typing import Any

GPTConfig = Any
T5Config = Any
HAVE_NEMO = False
from .nemo_utils import GPTConfig, T5Config

LOGGER = logging.getLogger("NeMo")

Expand Down
3 changes: 3 additions & 0 deletions nemo_deploy/llm/megatronllm_deployable.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class MegatronLLMDeployable(ITritonDeployable):
legacy_ckpt (bool): use legacy checkpoint format. Defaults to False.
model_type (str): type of model to load. Defaults to "gpt".
micro_batch_size (Optional[int]): micro batch size for model execution. Defaults to None.
buffer_size_gb (float): KV cache buffer size in GiB for DynamicInferenceContext. Defaults to 10.0.
"""

def __init__(
Expand All @@ -102,6 +103,7 @@ def __init__(
legacy_ckpt: bool = False,
model_type: str = "gpt",
micro_batch_size: Optional[int] = None,
buffer_size_gb: float = 10.0,
**model_config_kwargs,
):
if not HAVE_TRITON:
Expand Down Expand Up @@ -131,6 +133,7 @@ def __init__(
model_type=model_type,
model_format="megatron",
micro_batch_size=micro_batch_size,
buffer_size_gb=buffer_size_gb,
**model_config_kwargs,
)
self.enable_cuda_graphs = enable_cuda_graphs
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ vllm = [
{ index = "pytorch-cu130", marker = "python_version < '3.9' and platform_machine == 'x86_64'" },
{ index = "pypi", marker = "platform_machine == 'aarch64'" },
]
megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "65a21df6bdafc198c26baa26b748fe55f3a19fd9" }
megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "15d758f1346fafa6bf0485af70deb3cc88da2909" }
# nemo-toolkit = { git = "https://github.com/NVIDIA/NeMo.git", rev = "main" }

[tool.uv]
Expand Down Expand Up @@ -156,6 +156,7 @@ override-dependencies = [
"transformers>=4.57.1",
"protobuf~=6.33.5",
"opencv-python-headless; sys_platform == 'never'",
"cryptography>=43.0.0,<47",
]
prerelease = "allow"

Expand Down
Loading
Loading